Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target')
191 files changed, 5189 insertions, 2257 deletions
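Most of the AArch64 churn below comes from the new AArch64LoopIdiomTransform pass added in this commit. For orientation, here is a small, self-contained sketch (illustrative only, not code from the diff; the function and values are hypothetical) of the byte-compare idiom the pass recognizes: a loop that pre-increments its index, compares one byte from each of two arrays, and exits early at the first mismatch.

// Illustration of the idiom targeted by AArch64LoopIdiomTransform (hypothetical
// standalone example; the real pass operates on LLVM IR, not C++ source).
#include <cstdint>
#include <cstdio>

// Shape matched by the pass: pre-increment the index, compare bytes, early exit.
// Returns either the first mismatching index or n.
static uint32_t firstMismatch(const uint8_t *a, const uint8_t *b, uint32_t i,
                              uint32_t n) {
  while (++i != n) {
    if (a[i] != b[i])
      break;
  }
  return i;
}

int main() {
  uint8_t x[] = {1, 2, 3, 4, 5};
  uint8_t y[] = {1, 2, 9, 4, 5};
  std::printf("%u\n", firstMismatch(x, y, 0, 5)); // prints 2
  return 0;
}

Per the pass's own comments in the diff, loops of this shape are rewritten into an SVE predicated loop (using masked loads and the experimental.cttz.elts intrinsic) plus a scalar fallback; because the vector loop can read ahead of the scalar loop's early exit, the pass emits runtime page checks so those speculative reads cannot cross a page boundary and fault. Note the new pass is disabled by default behind the disable-aarch64-lit-all option.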
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h index 901769c54b6e..d20ef63a72e8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.h @@ -88,6 +88,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);  void initializeAArch64ExpandPseudoPass(PassRegistry &);  void initializeAArch64GlobalsTaggingPass(PassRegistry &);  void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);  void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);  void initializeAArch64MIPeepholeOptPass(PassRegistry &);  void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td index 68f452039c9b..d5e8ed101d1c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64.td @@ -1405,7 +1405,7 @@ def ProcessorFeatures {                                   FeatureSSBS];    list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,                                   FeatureNEON, FeatureFullFP16, FeatureDotProd, -                                 FeatureFlagM, FeatureFP16FML, FeaturePAuth, +                                 FeatureFlagM, FeaturePAuth,                                   FeaturePerfMon, FeatureRCPC, FeatureSPE,                                   FeatureSSBS];    list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9b8162ce8dd4..e98f6c4984a7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -645,7 +645,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();          TmpOffset += SL->getElementOffset(Idx);        } else { -        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +        uint64_t S = GTI.getSequentialElementStride(DL);          while (true) {            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {              // Constant-offset addressing. @@ -1231,15 +1231,6 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,    // Only extend the RHS within the instruction if there is a valid extend type.    
if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&        isValueAvailable(RHS)) { -    if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) -      if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) -        if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { -          Register RHSReg = getRegForValue(SI->getOperand(0)); -          if (!RHSReg) -            return 0; -          return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, -                               C->getZExtValue(), SetFlags, WantResult); -        }      Register RHSReg = getRegForValue(RHS);      if (!RHSReg)        return 0; @@ -4987,15 +4978,13 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {        if (Field)          TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);      } else { -      Type *Ty = GTI.getIndexedType(); -        // If this is a constant subscript, handle it quickly.        if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {          if (CI->isZero())            continue;          // N = N + Offset -        TotalOffs += -            DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue(); +        TotalOffs += GTI.getSequentialElementStride(DL) * +                     cast<ConstantInt>(CI)->getSExtValue();          continue;        }        if (TotalOffs) { @@ -5006,7 +4995,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {        }        // N = N + Idx * ElementSize; -      uint64_t ElementSize = DL.getTypeAllocSize(Ty); +      uint64_t ElementSize = GTI.getSequentialElementStride(DL);        unsigned IdxN = getRegForGEPIndex(Idx);        if (!IdxN)          return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 476d99c2a7e0..edc8cc7d4d1e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -580,7 +580,7 @@ bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,    if (!isa<ConstantSDNode>(N.getNode()))      return false; -  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); +  uint64_t Immed = N.getNode()->getAsZExtVal();    unsigned ShiftAmt;    if (Immed >> 12 == 0) { @@ -611,7 +611,7 @@ bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,      return false;    // The immediate operand must be a 24-bit zero-extended immediate. -  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); +  uint64_t Immed = N.getNode()->getAsZExtVal();    // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"    // have the opposite effect on the C flag, so this pattern mustn't match under @@ -1326,7 +1326,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,    //     MOV  X0, WideImmediate    //     LDR  X2, [BaseReg, X0]    if (isa<ConstantSDNode>(RHS)) { -    int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); +    int64_t ImmOff = (int64_t)RHS->getAsZExtVal();      // Skip the immediate can be selected by load/store addressing mode.      // Also skip the immediate can be encoded by a single ADD (SUB is also      // checked by using -ImmOff). 
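The FastISel and ISelDAGToDAG hunks above (and several later hunks in this diff) apply two mechanical substitutions: cast<ConstantSDNode>(N)->getZExtValue() becomes the getAsZExtVal() accessor, and DL.getTypeAllocSize(GTI.getIndexedType()) becomes GTI.getSequentialElementStride(DL). The following is a minimal sketch of the first substitution, assuming LLVM headers that provide SDValue/SDNode/ConstantSDNode; the helper names are hypothetical and the equivalence is as implied by these hunks, not verified against the accessor's implementation.

// Sketch of the constant-node accessor change seen throughout this diff.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Old spelling: assert-cast the node to ConstantSDNode, then read the value.
static uint64_t constantValueOld(SDValue N) {
  return cast<ConstantSDNode>(N.getNode())->getZExtValue();
}

// New spelling used in these hunks; both forms expect the node to already be
// known to be a constant.
static uint64_t constantValueNew(SDValue N) {
  return N.getNode()->getAsZExtVal();
}

The FastISel hunks make the analogous swap when accumulating GEP offsets, using GTI.getSequentialElementStride(DL) as the per-index scale for sequential (non-struct) GEP indices instead of querying the indexed type's alloc size directly.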
diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 102fd0c3dae2..47e665176e8b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3588,8 +3588,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,    //    cmp     w13, w12    // can be turned into:    //    cmp     w12, w11, lsl #1 -  if (!isa<ConstantSDNode>(RHS) || -      !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { +  if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) {      SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;      if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { @@ -3623,7 +3622,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,          cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&          LHS.getNode()->hasNUsesOfValue(1, 0)) { -      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); +      int16_t ValueofRHS = RHS->getAsZExtVal();        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {          SDValue SExt =              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, @@ -5619,7 +5618,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,    // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else    // must be calculated before hand. -  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); +  uint64_t ScaleVal = Scale->getAsZExtVal();    if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {      assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");      EVT IndexVT = Index.getValueType(); @@ -5707,7 +5706,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,    // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else    // must be calculated before hand. -  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); +  uint64_t ScaleVal = Scale->getAsZExtVal();    if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {      assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");      EVT IndexVT = Index.getValueType(); @@ -16516,9 +16515,9 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {      if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=          VT.getVectorNumElements() * 2)        return SDValue(); -    if ((Ext0.getConstantOperandVal(1) != 0 && +    if ((Ext0.getConstantOperandVal(1) != 0 ||           Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) && -        (Ext1.getConstantOperandVal(1) != 0 && +        (Ext1.getConstantOperandVal(1) != 0 ||           Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))        return SDValue();      unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? 
AArch64ISD::UADDLP @@ -22011,7 +22010,7 @@ static SDValue performBRCONDCombine(SDNode *N,    SDValue Cmp = N->getOperand(3);    assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); -  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); +  unsigned CC = CCVal->getAsZExtVal();    if (CC != AArch64CC::EQ && CC != AArch64CC::NE)      return SDValue(); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cb63d8726744..10ad5b1f8f25 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12586,6 +12586,7 @@ def : TokenAlias<".4S", ".4s">;  def : TokenAlias<".2D", ".2d">;  def : TokenAlias<".1Q", ".1q">;  def : TokenAlias<".2H", ".2h">; +def : TokenAlias<".2B", ".2b">;  def : TokenAlias<".B", ".b">;  def : TokenAlias<".H", ".h">;  def : TokenAlias<".S", ".s">; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp new file mode 100644 index 000000000000..6fcd9c290e9c --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp @@ -0,0 +1,828 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +//  while (++i != n) { +//    if (a[i] != b[i]) +//      break; +//  } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. However, even with these checks it is still profitable to do the +// transformation. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// * Add support for the inverse case where we scan for a matching element. +// * Permit 64-bit induction variable types. +// * Recognize loops that increment the IV *after* comparing bytes. +// * Allow 32-bit sign-extends of the IV used by the GEP. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt<bool> +    DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(true), +               cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt<bool> DisableByteCmp( +    "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), +    cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " +             "not convert byte-compare loop(s).")); + +static cl::opt<bool> VerifyLoops( +    "aarch64-lit-verify", cl::Hidden, cl::init(false), +    cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { +  Loop *CurLoop = nullptr; +  DominatorTree *DT; +  LoopInfo *LI; +  const TargetTransformInfo *TTI; +  const DataLayout *DL; + +public: +  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, +                                     const TargetTransformInfo *TTI, +                                     const DataLayout *DL) +      : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + +  bool run(Loop *L); + +private: +  /// \name Countable Loop Idiom Handling +  /// @{ + +  bool runOnCountableLoop(); +  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, +                      SmallVectorImpl<BasicBlock *> &ExitBlocks); + +  bool recognizeByteCompare(); +  Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, +                            GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +                            Instruction *Index, Value *Start, Value *MaxLen); +  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, +                            PHINode *IndPhi, Value *MaxLen, Instruction *Index, +                            Value *Start, bool IncIdx, BasicBlock *FoundBB, +                            BasicBlock *EndBB); +  /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: +  static char ID; + +  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { +    initializeAArch64LoopIdiomTransformLegacyPassPass( +        *PassRegistry::getPassRegistry()); +  } + +  StringRef getPassName() const override { +    return "Transform AArch64-specific loop idioms"; +  } + +  void getAnalysisUsage(AnalysisUsage &AU) const override { +    AU.addRequired<LoopInfoWrapperPass>(); +    AU.addRequired<DominatorTreeWrapperPass>(); +    AU.addRequired<TargetTransformInfoWrapperPass>(); +  } + +  bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, +                                                    LPPassManager &LPM) { + +  if (skipLoop(L)) +    return false; + +  auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); +  auto *LI = 
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); +  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( +      *L->getHeader()->getParent()); +  return AArch64LoopIdiomTransform( +             DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) +      .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( +    AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", +    "Transform specific loop idioms into optimized vector forms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END( +    AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", +    "Transform specific loop idioms into optimized vector forms", false, false) + +Pass *llvm::createAArch64LoopIdiomTransformPass() { +  return new AArch64LoopIdiomTransformLegacyPass(); +} + +PreservedAnalyses +AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, +                                   LoopStandardAnalysisResults &AR, +                                   LPMUpdater &) { +  if (DisableAll) +    return PreservedAnalyses::all(); + +  const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + +  AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); +  if (!LIT.run(&L)) +    return PreservedAnalyses::all(); + +  return PreservedAnalyses::none(); +} + +//===----------------------------------------------------------------------===// +// +//          Implementation of AArch64LoopIdiomTransform +// +//===----------------------------------------------------------------------===// + +bool AArch64LoopIdiomTransform::run(Loop *L) { +  CurLoop = L; + +  if (DisableAll || L->getHeader()->getParent()->hasOptSize()) +    return false; + +  // If the loop could not be converted to canonical form, it must have an +  // indirectbr in it, just give up. +  if (!L->getLoopPreheader()) +    return false; + +  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" +                    << CurLoop->getHeader()->getParent()->getName() +                    << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + +  return recognizeByteCompare(); +} + +bool AArch64LoopIdiomTransform::recognizeByteCompare() { +  // Currently the transformation only works on scalable vector types, although +  // there is no fundamental reason why it cannot be made to work for fixed +  // width too. + +  // We also need to know the minimum page size for the target in order to +  // generate runtime memory checks to ensure the vector version won't fault. +  if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || +      DisableByteCmp) +    return false; + +  BasicBlock *Header = CurLoop->getHeader(); + +  // In AArch64LoopIdiomTransform::run we have already checked that the loop +  // has a preheader so we can assume it's in a canonical form. +  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) +    return false; + +  PHINode *PN = dyn_cast<PHINode>(&Header->front()); +  if (!PN || PN->getNumIncomingValues() != 2) +    return false; + +  auto LoopBlocks = CurLoop->getBlocks(); +  // The first block in the loop should contain only 4 instructions, e.g. 
+  // +  //  while.cond: +  //   %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] +  //   %inc = add i32 %res.phi, 1 +  //   %cmp.not = icmp eq i32 %inc, %n +  //   br i1 %cmp.not, label %while.end, label %while.body +  // +  auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); +  if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4) +    return false; + +  // The second block should contain 7 instructions, e.g. +  // +  // while.body: +  //   %idx = zext i32 %inc to i64 +  //   %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx +  //   %load.a = load i8, ptr %idx.a +  //   %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx +  //   %load.b = load i8, ptr %idx.b +  //   %cmp.not.ld = icmp eq i8 %load.a, %load.b +  //   br i1 %cmp.not.ld, label %while.cond, label %while.end +  // +  auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); +  if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7) +    return false; + +  // The incoming value to the PHI node from the loop should be an add of 1. +  Value *StartIdx = nullptr; +  Instruction *Index = nullptr; +  if (!CurLoop->contains(PN->getIncomingBlock(0))) { +    StartIdx = PN->getIncomingValue(0); +    Index = dyn_cast<Instruction>(PN->getIncomingValue(1)); +  } else { +    StartIdx = PN->getIncomingValue(1); +    Index = dyn_cast<Instruction>(PN->getIncomingValue(0)); +  } + +  // Limit to 32-bit types for now +  if (!Index || !Index->getType()->isIntegerTy(32) || +      !match(Index, m_c_Add(m_Specific(PN), m_One()))) +    return false; + +  // If we match the pattern, PN and Index will be replaced with the result of +  // the cttz.elts intrinsic. If any other instructions are used outside of +  // the loop, we cannot replace it. +  for (BasicBlock *BB : LoopBlocks) +    for (Instruction &I : *BB) +      if (&I != PN && &I != Index) +        for (User *U : I.users()) +          if (!CurLoop->contains(cast<Instruction>(U))) +            return false; + +  // Match the branch instruction for the header +  ICmpInst::Predicate Pred; +  Value *MaxLen; +  BasicBlock *EndBB, *WhileBB; +  if (!match(Header->getTerminator(), +             m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), +                  m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || +      Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) +    return false; + +  // WhileBB should contain the pattern of load & compare instructions. Match +  // the pattern and find the GEP instructions used by the loads. 
+  ICmpInst::Predicate WhilePred; +  BasicBlock *FoundBB; +  BasicBlock *TrueBB; +  Value *LoadA, *LoadB; +  if (!match(WhileBB->getTerminator(), +             m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), +                  m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || +      WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) +    return false; + +  Value *A, *B; +  if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B)))) +    return false; + +  LoadInst *LoadAI = cast<LoadInst>(LoadA); +  LoadInst *LoadBI = cast<LoadInst>(LoadB); +  if (!LoadAI->isSimple() || !LoadBI->isSimple()) +    return false; + +  GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A); +  GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B); + +  if (!GEPA || !GEPB) +    return false; + +  Value *PtrA = GEPA->getPointerOperand(); +  Value *PtrB = GEPB->getPointerOperand(); + +  // Check we are loading i8 values from two loop invariant pointers +  if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || +      !GEPA->getResultElementType()->isIntegerTy(8) || +      !GEPB->getResultElementType()->isIntegerTy(8) || +      !LoadAI->getType()->isIntegerTy(8) || +      !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB) +    return false; + +  // Check that the index to the GEPs is the index we found earlier +  if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) +    return false; + +  Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); +  Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); +  if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) +    return false; + +  // We only ever expect the pre-incremented index value to be used inside the +  // loop. +  if (!PN->hasOneUse()) +    return false; + +  // Ensure that when the Found and End blocks are identical the PHIs have the +  // supported format. We don't currently allow cases like this: +  // while.cond: +  //   ... +  //   br i1 %cmp.not, label %while.end, label %while.body +  // +  // while.body: +  //   ... +  //   br i1 %cmp.not2, label %while.cond, label %while.end +  // +  // while.end: +  //   %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] +  // +  // Where the incoming values for %final_ptr are unique and from each of the +  // loop blocks, but not actually defined in the loop. This requires extra +  // work setting up the byte.compare block, i.e. by introducing a select to +  // choose the correct value. +  // TODO: We could add support for this in future. +  if (FoundBB == EndBB) { +    for (PHINode &EndPN : EndBB->phis()) { +      Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header); +      Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB); + +      // The value of the index when leaving the while.cond block is always the +      // same as the end value (MaxLen) so we permit either. Otherwise for any +      // other value defined outside the loop we only allow values that are the +      // same as the exit value for while.body. +      if (WhileCondVal != Index && WhileCondVal != MaxLen && +          WhileCondVal != WhileBodyVal) +        return false; +    } +  } + +  LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" +                    << *(EndBB->getParent()) << "\n\n"); + +  // The index is incremented before the GEP/Load pair so we need to +  // add 1 to the start value. 
+  transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true, +                       FoundBB, EndBB); +  return true; +} + +Value *AArch64LoopIdiomTransform::expandFindMismatch( +    IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, +    GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { +  Value *PtrA = GEPA->getPointerOperand(); +  Value *PtrB = GEPB->getPointerOperand(); + +  // Get the arguments and types for the intrinsic. +  BasicBlock *Preheader = CurLoop->getLoopPreheader(); +  BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); +  LLVMContext &Ctx = PHBranch->getContext(); +  Type *LoadType = Type::getInt8Ty(Ctx); +  Type *ResType = Builder.getInt32Ty(); + +  // Split block in the original loop preheader. +  BasicBlock *EndBlock = +      SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + +  // Create the blocks that we're going to need: +  //  1. A block for checking the zero-extended length exceeds 0 +  //  2. A block to check that the start and end addresses of a given array +  //     lie on the same page. +  //  3. The SVE loop preheader. +  //  4. The first SVE loop block. +  //  5. The SVE loop increment block. +  //  6. A block we can jump to from the SVE loop when a mismatch is found. +  //  7. The first block of the scalar loop itself, containing PHIs , loads +  //  and cmp. +  //  8. A scalar loop increment block to increment the PHIs and go back +  //  around the loop. + +  BasicBlock *MinItCheckBlock = BasicBlock::Create( +      Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); + +  // Update the terminator added by SplitBlock to branch to the first block +  Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); + +  BasicBlock *MemCheckBlock = BasicBlock::Create( +      Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); + +  BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create( +      Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock); + +  BasicBlock *SVELoopStartBlock = BasicBlock::Create( +      Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock); + +  BasicBlock *SVELoopIncBlock = BasicBlock::Create( +      Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock); + +  BasicBlock *SVELoopMismatchBlock = BasicBlock::Create( +      Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock); + +  BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( +      Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); + +  BasicBlock *LoopStartBlock = +      BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); + +  BasicBlock *LoopIncBlock = BasicBlock::Create( +      Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); + +  DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, +                    {DominatorTree::Delete, Preheader, EndBlock}}); + +  // Update LoopInfo with the new SVE & scalar loops. 
+  auto SVELoop = LI->AllocateLoop(); +  auto ScalarLoop = LI->AllocateLoop(); + +  if (CurLoop->getParentLoop()) { +    CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI); +    CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI); +    CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI); +    CurLoop->getParentLoop()->addChildLoop(SVELoop); +    CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI); +    CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); +    CurLoop->getParentLoop()->addChildLoop(ScalarLoop); +  } else { +    LI->addTopLevelLoop(SVELoop); +    LI->addTopLevelLoop(ScalarLoop); +  } + +  // Add the new basic blocks to their associated loops. +  SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI); +  SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI); + +  ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); +  ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); + +  // Set up some types and constants that we intend to reuse. +  Type *I64Type = Builder.getInt64Ty(); + +  // Check the zero-extended iteration count > 0 +  Builder.SetInsertPoint(MinItCheckBlock); +  Value *ExtStart = Builder.CreateZExt(Start, I64Type); +  Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); +  // This check doesn't really cost us very much. + +  Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); +  BranchInst *MinItCheckBr = +      BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); +  MinItCheckBr->setMetadata( +      LLVMContext::MD_prof, +      MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); +  Builder.Insert(MinItCheckBr); + +  DTU.applyUpdates( +      {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, +       {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); + +  // For each of the arrays, check the start/end addresses are on the same +  // page. +  Builder.SetInsertPoint(MemCheckBlock); + +  // The early exit in the original loop means that when performing vector +  // loads we are potentially reading ahead of the early exit. So we could +  // fault if crossing a page boundary. Therefore, we create runtime memory +  // checks based on the minimum page size as follows: +  //   1. Calculate the addresses of the first memory accesses in the loop, +  //      i.e. LhsStart and RhsStart. +  //   2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. +  //   3. Determine which pages correspond to all the memory accesses, i.e +  //      LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. +  //   4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then +  //      we know we won't cross any page boundaries in the loop so we can +  //      enter the vector loop! Otherwise we fall back on the scalar loop. 
+  Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart); +  Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart); +  Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type); +  Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type); +  Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd); +  Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd); +  Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type); +  Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type); + +  const uint64_t MinPageSize = TTI->getMinPageSize().value(); +  const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); +  Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt); +  Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt); +  Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt); +  Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt); +  Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); +  Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); + +  Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); +  BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( +      LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp); +  CombinedPageCmpCmpBr->setMetadata( +      LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) +                                .createBranchWeights(10, 90)); +  Builder.Insert(CombinedPageCmpCmpBr); + +  DTU.applyUpdates( +      {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, +       {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); + +  // Set up the SVE loop preheader, i.e. calculate initial loop predicate, +  // zero-extend MaxLen to 64-bits, determine the number of vector elements +  // processed in each iteration, etc. +  Builder.SetInsertPoint(SVELoopPreheaderBlock); + +  // At this point we know two things must be true: +  //  1. Start <= End +  //  2. ExtMaxLen <= MinPageSize due to the page checks. +  // Therefore, we know that we can use a 64-bit induction variable that +  // starts from 0 -> ExtMaxLen and it will not overflow. +  ScalableVectorType *PredVTy = +      ScalableVectorType::get(Builder.getInt1Ty(), 16); + +  Value *InitialPred = Builder.CreateIntrinsic( +      Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + +  Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); +  VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", +                             /*HasNUW=*/true, /*HasNSW=*/true); + +  Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), +                                            Builder.getInt1(false)); + +  BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock); +  Builder.Insert(JumpToSVELoop); + +  DTU.applyUpdates( +      {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); + +  // Set up the first SVE loop block by creating the PHIs, doing the vector +  // loads and comparing the vectors. 
+  Builder.SetInsertPoint(SVELoopStartBlock); +  PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred"); +  LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock); +  PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index"); +  SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock); +  Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); +  Value *Passthru = ConstantInt::getNullValue(SVELoadType); + +  Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi); +  if (GEPA->isInBounds()) +    cast<GetElementPtrInst>(SVELhsGep)->setIsInBounds(true); +  Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1), +                                               LoopPred, Passthru); + +  Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi); +  if (GEPB->isInBounds()) +    cast<GetElementPtrInst>(SVERhsGep)->setIsInBounds(true); +  Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1), +                                               LoopPred, Passthru); + +  Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad); +  SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse); +  Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp); +  BranchInst *SVEEarlyExit = BranchInst::Create( +      SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes); +  Builder.Insert(SVEEarlyExit); + +  DTU.applyUpdates( +      {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, +       {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); + +  // Increment the index counter and calculate the predicate for the next +  // iteration of the loop. We branch back to the start of the loop if there +  // is at least one active lane. +  Builder.SetInsertPoint(SVELoopIncBlock); +  Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "", +                                            /*HasNUW=*/true, /*HasNSW=*/true); +  SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock); +  Value *NewPred = +      Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, +                              {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); +  LoopPred->addIncoming(NewPred, SVELoopIncBlock); + +  Value *PredHasActiveLanes = +      Builder.CreateExtractElement(NewPred, uint64_t(0)); +  BranchInst *SVELoopBranchBack = +      BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes); +  Builder.Insert(SVELoopBranchBack); + +  DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, +                    {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); + +  // If we found a mismatch then we need to calculate which lane in the vector +  // had a mismatch and add that on to the current loop index. 
+  Builder.SetInsertPoint(SVELoopMismatchBlock); +  PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred"); +  FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock); +  PHINode *LastLoopPred = +      Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred"); +  LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock); +  PHINode *SVEFoundIndex = +      Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index"); +  SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock); + +  Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); +  Value *Ctz = Builder.CreateIntrinsic( +      Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, +      {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); +  Ctz = Builder.CreateZExt(Ctz, I64Type); +  Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "", +                                          /*HasNUW=*/true, /*HasNSW=*/true); +  Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType); + +  Builder.Insert(BranchInst::Create(EndBlock)); + +  DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); + +  // Generate code for scalar loop. +  Builder.SetInsertPoint(LoopPreHeaderBlock); +  Builder.Insert(BranchInst::Create(LoopStartBlock)); + +  DTU.applyUpdates( +      {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); + +  Builder.SetInsertPoint(LoopStartBlock); +  PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); +  IndexPhi->addIncoming(Start, LoopPreHeaderBlock); + +  // Otherwise compare the values +  // Load bytes from each array and compare them. +  Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type); + +  Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); +  if (GEPA->isInBounds()) +    cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true); +  Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); + +  Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); +  if (GEPB->isInBounds()) +    cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true); +  Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); + +  Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); +  // If we have a mismatch then exit the loop ... +  BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); +  Builder.Insert(MatchCmpBr); + +  DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, +                    {DominatorTree::Insert, LoopStartBlock, EndBlock}}); + +  // Have we reached the maximum permitted length for the loop? +  Builder.SetInsertPoint(LoopIncBlock); +  Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", +                                    /*HasNUW=*/Index->hasNoUnsignedWrap(), +                                    /*HasNSW=*/Index->hasNoSignedWrap()); +  IndexPhi->addIncoming(PhiInc, LoopIncBlock); +  Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen); +  BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); +  Builder.Insert(IVCmpBr); + +  DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, +                    {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); + +  // In the end block we need to insert a PHI node to deal with four cases: +  //  1. We didn't find a mismatch in the scalar loop, so we return MaxLen. +  //  2. We exited the scalar loop early due to a mismatch and need to return +  //  the index that we found. +  //  3. We didn't find a mismatch in the SVE loop, so we return MaxLen. +  //  4. 
We exited the SVE loop early due to a mismatch and need to return +  //  the index that we found. +  Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); +  PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); +  ResPhi->addIncoming(MaxLen, LoopIncBlock); +  ResPhi->addIncoming(IndexPhi, LoopStartBlock); +  ResPhi->addIncoming(MaxLen, SVELoopIncBlock); +  ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock); + +  Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType); + +  if (VerifyLoops) { +    ScalarLoop->verifyLoop(); +    SVELoop->verifyLoop(); +    if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI)) +      report_fatal_error("Loops must remain in LCSSA form!"); +    if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI)) +      report_fatal_error("Loops must remain in LCSSA form!"); +  } + +  return FinalRes; +} + +void AArch64LoopIdiomTransform::transformByteCompare( +    GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, +    Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, +    BasicBlock *FoundBB, BasicBlock *EndBB) { + +  // Insert the byte compare code at the end of the preheader block +  BasicBlock *Preheader = CurLoop->getLoopPreheader(); +  BasicBlock *Header = CurLoop->getHeader(); +  BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); +  IRBuilder<> Builder(PHBranch); +  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); +  Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + +  // Increment the pointer if this was done before the loads in the loop. +  if (IncIdx) +    Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); + +  Value *ByteCmpRes = +      expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); + +  // Replace uses of the index & induction Phi with the intrinsic (we already +  // checked that the first instruction of Header is the Phi above). +  assert(IndPhi->hasOneUse() && "Index phi node has more than one use!"); +  Index->replaceAllUsesWith(ByteCmpRes); + +  assert(PHBranch->isUnconditional() && +         "Expected preheader to terminate with an unconditional branch."); + +  // If no mismatch was found, we can jump to the end block. Create a +  // new basic block for the compare instruction. +  auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", +                                   Preheader->getParent()); +  CmpBB->moveBefore(EndBB); + +  // Replace the branch in the preheader with an always-true conditional branch. +  // This ensures there is still a reference to the original loop. +  Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header); +  PHBranch->eraseFromParent(); + +  BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent(); +  DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}}); + +  // Create the branch to either the end or found block depending on the value +  // returned by the intrinsic. 
+  Builder.SetInsertPoint(CmpBB); +  if (FoundBB != EndBB) { +    Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); +    Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); +    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, +                      {DominatorTree::Insert, CmpBB, EndBB}}); + +  } else { +    Builder.CreateBr(FoundBB); +    DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); +  } + +  auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { +    for (PHINode &PN : SuccBB->phis()) { +      // At this point we've already replaced all uses of the result from the +      // loop with ByteCmp. Look through the incoming values to find ByteCmp, +      // meaning this is a Phi collecting the results of the byte compare. +      bool ResPhi = false; +      for (Value *Op : PN.incoming_values()) +        if (Op == ByteCmpRes) { +          ResPhi = true; +          break; +        } + +      // Any PHI that depended upon the result of the byte compare needs a new +      // incoming value from CmpBB. This is because the original loop will get +      // deleted. +      if (ResPhi) +        PN.addIncoming(ByteCmpRes, CmpBB); +      else { +        // There should be no other outside uses of other values in the +        // original loop. Any incoming values should either: +        //   1. Be for blocks outside the loop, which aren't interesting. Or .. +        //   2. Be from blocks in the loop with values defined outside +        //      the loop. We should add a similar incoming value from CmpBB. +        for (BasicBlock *BB : PN.blocks()) +          if (CurLoop->contains(BB)) { +            PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); +            break; +          } +      } +    } +  }; + +  // Ensure all Phis in the successors of CmpBB have an incoming value from it. +  fixSuccessorPhis(EndBB); +  if (EndBB != FoundBB) +    fixSuccessorPhis(FoundBB); + +  // The new CmpBB block isn't part of the loop, but will need to be added to +  // the outer loop if there is one. +  if (!CurLoop->isOutermost()) +    CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI); + +  if (VerifyLoops && CurLoop->getParentLoop()) { +    CurLoop->getParentLoop()->verifyLoop(); +    if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) +      report_fatal_error("Loops must remain in LCSSA form!"); +  } +} diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h new file mode 100644 index 000000000000..cc68425bb68b --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h @@ -0,0 +1,25 @@ +//===- AArch64LoopIdiomTransform.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +struct AArch64LoopIdiomTransformPass +    : PassInfoMixin<AArch64LoopIdiomTransformPass> { +  PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, +                        LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 738a52eebad2..380f6e1fcfda 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -810,7 +810,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>;  defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>;  } -let Predicates = [HasSME2p1, HasB16B16] in { +let Predicates = [HasSME2, HasB16B16] in {  defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>;  defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>;  defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r,  nxv8bf16, null_frag>; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 344a15389063..ee10a7d1c706 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -453,6 +453,9 @@ def AArch64msb_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3),  def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3),                             [(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3),                              (xor node:$op1, (xor node:$op2, node:$op3))]>; +def AArch64bcax : PatFrags<(ops node:$op1, node:$op2, node:$op3), +                           [(int_aarch64_sve_bcax node:$op1, node:$op2, node:$op3), +                            (xor node:$op1, (and node:$op2, (vnot node:$op3)))]>;  def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm),                                [(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm), @@ -3714,7 +3717,7 @@ let Predicates = [HasSVE2orSME] in {    // SVE2 bitwise ternary operations    defm EOR3_ZZZZ  : sve2_int_bitwise_ternary_op<0b000, "eor3",  AArch64eor3>; -  defm BCAX_ZZZZ  : sve2_int_bitwise_ternary_op<0b010, "bcax",  int_aarch64_sve_bcax>; +  defm BCAX_ZZZZ  : sve2_int_bitwise_ternary_op<0b010, "bcax",  AArch64bcax>;    defm BSL_ZZZZ   : sve2_int_bitwise_ternary_op<0b001, "bsl",   int_aarch64_sve_bsl, AArch64bsp>;    defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;    defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 1a76f354589e..9e43f206efcf 
100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -172,7 +172,7 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl,  SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(      SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Addr,      SDValue Size, MachinePointerInfo DstPtrInfo, bool ZeroData) const { -  uint64_t ObjSize = cast<ConstantSDNode>(Size)->getZExtValue(); +  uint64_t ObjSize = Size->getAsZExtVal();    assert(ObjSize % 16 == 0);    MachineFunction &MF = DAG.getMachineFunction(); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 036719be06d8..144610e021c5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@  #include "AArch64TargetMachine.h"  #include "AArch64.h" +#include "AArch64LoopIdiomTransform.h"  #include "AArch64MachineFunctionInfo.h"  #include "AArch64MachineScheduler.h"  #include "AArch64MacroFusion.h" @@ -43,6 +44,7 @@  #include "llvm/MC/MCTargetOptions.h"  #include "llvm/MC/TargetRegistry.h"  #include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h"  #include "llvm/Support/CodeGen.h"  #include "llvm/Support/CommandLine.h"  #include "llvm/Target/TargetLoweringObjectFile.h" @@ -222,6 +224,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {    initializeAArch64DeadRegisterDefinitionsPass(*PR);    initializeAArch64ExpandPseudoPass(*PR);    initializeAArch64LoadStoreOptPass(*PR); +  initializeAArch64LoopIdiomTransformLegacyPassPass(*PR);    initializeAArch64MIPeepholeOptPass(*PR);    initializeAArch64SIMDInstrOptPass(*PR);    initializeAArch64O0PreLegalizerCombinerPass(*PR); @@ -537,6 +540,14 @@ public:  } // end anonymous namespace +void AArch64TargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) { +  PB.registerLateLoopOptimizationsEPCallback( +      [=](LoopPassManager &LPM, OptimizationLevel Level) { +        LPM.addPass(AArch64LoopIdiomTransformPass()); +      }); +} +  TargetTransformInfo  AArch64TargetMachine::getTargetTransformInfo(const Function &F) const {    return TargetTransformInfo(AArch64TTIImpl(this, F)); diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 12b971853f84..8fb68b06f137 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -14,6 +14,7 @@  #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H  #include "AArch64InstrInfo.h" +#include "AArch64LoopIdiomTransform.h"  #include "AArch64Subtarget.h"  #include "llvm/IR/DataLayout.h"  #include "llvm/Target/TargetMachine.h" @@ -43,6 +44,9 @@ public:    // Pass Pipeline Configuration    TargetPassConfig *createPassConfig(PassManagerBase &PM) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override; +    TargetTransformInfo getTargetTransformInfo(const Function &F) const override;    TargetLoweringObjectFile* getObjFileLowering() const override { diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h 
b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0b220069a388..f471294ffc25 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -420,6 +420,8 @@ public:      return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);    } + +  std::optional<unsigned> getMinPageSize() const { return 4096; }  };  } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 1d0e8be80d07..b657a0954d78 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -282,6 +282,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)        // Regardless of FP16 support, widen 16-bit elements to 32-bits.        .minScalar(0, s32)        .libcallFor({s32, s64}); +  getActionDefinitionsBuilder(G_FPOWI) +      .scalarize(0) +      .minScalar(0, s32) +      .libcallFor({{s32, s32}, {s64, s32}});    getActionDefinitionsBuilder(G_INSERT)        .legalIf(all(typeInSet(0, {s32, s64, p0}), @@ -362,7 +366,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)                                   {v4s32, p0, s128, 8},                                   {v2s64, p0, s128, 8}})        // These extends are also legal -      .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}}) +      .legalForTypesWithMemDesc( +          {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}})        .widenScalarToNextPow2(0, /* MinSize = */ 8)        .lowerIfMemSizeNotByteSizePow2()        .clampScalar(0, s8, s64) @@ -761,17 +766,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)        .lowerIf(            all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); +  LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { +    return ST.outlineAtomics() && !ST.hasLSE(); +  }; +    getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) -      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) -      .customIf([](const LegalityQuery &Query) { -        return Query.Types[0].getSizeInBits() == 128; +      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), +                   predNot(UseOutlineAtomics))) +      .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) +      .customIf([UseOutlineAtomics](const LegalityQuery &Query) { +        return Query.Types[0].getSizeInBits() == 128 && +               !UseOutlineAtomics(Query);        }) +      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), +                     UseOutlineAtomics)) +      .clampScalar(0, s32, s64); + +  getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, +                               G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, +                               G_ATOMICRMW_XOR}) +      .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), +                   predNot(UseOutlineAtomics))) +      .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), +                     UseOutlineAtomics))        .clampScalar(0, s32, s64); +  // Do not outline these atomics operations, as per comment in +  // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR().    
getActionDefinitionsBuilder( -      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, -       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, -       G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) +      {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX})        .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0)))        .clampScalar(0, s32, s64); @@ -989,6 +1012,23 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)        .clampMaxNumElements(1, s16, 8)        .lower(); +  // For fmul reductions we need to split up into individual operations. We +  // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of +  // smaller types, followed by scalarizing what remains. +  getActionDefinitionsBuilder(G_VECREDUCE_FMUL) +      .minScalarOrElt(0, MinFPScalar) +      .clampMaxNumElements(1, s64, 2) +      .clampMaxNumElements(1, s32, 4) +      .clampMaxNumElements(1, s16, 8) +      .clampMaxNumElements(1, s32, 2) +      .clampMaxNumElements(1, s16, 4) +      .scalarize(1) +      .lower(); + +  getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL}) +      .scalarize(2) +      .lower(); +    getActionDefinitionsBuilder(G_VECREDUCE_ADD)        .legalFor({{s8, v16s8},                   {s8, v8s8}, @@ -1137,8 +1177,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)    verify(*ST.getInstrInfo());  } -bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                          MachineInstr &MI) const { +bool AArch64LegalizerInfo::legalizeCustom( +    LegalizerHelper &Helper, MachineInstr &MI, +    LostDebugLocObserver &LocObserver) const {    MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;    MachineRegisterInfo &MRI = *MIRBuilder.getMRI();    GISelChangeObserver &Observer = Helper.Observer; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 19f77baa77f8..c62a9d847c52 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -23,12 +23,12 @@ namespace llvm {  class AArch64Subtarget; -/// This class provides the information for the target register banks.  
class AArch64LegalizerInfo : public LegalizerInfo {  public:    AArch64LegalizerInfo(const AArch64Subtarget &ST); -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;    bool legalizeIntrinsic(LegalizerHelper &Helper,                           MachineInstr &MI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td index b7552541e950..789ec817d3d8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10082,6 +10082,12 @@ multiclass sve2p1_vector_to_pred<string mnemonic, SDPatternOperator Op_lane, SDP    def : InstAlias<mnemonic # "\t$Pd, $Zn",                   (!cast<Instruction>(NAME # _B) PPR8:$Pd, ZPRAny:$Zn, 0), 1>; +  def : InstAlias<mnemonic # "\t$Pd, $Zn", +                 (!cast<Instruction>(NAME # _H) PPR16:$Pd, ZPRAny:$Zn, 0), 0>; +  def : InstAlias<mnemonic # "\t$Pd, $Zn", +                 (!cast<Instruction>(NAME # _S) PPR32:$Pd, ZPRAny:$Zn, 0), 0>; +  def : InstAlias<mnemonic # "\t$Pd, $Zn", +                 (!cast<Instruction>(NAME # _D) PPR64:$Pd, ZPRAny:$Zn, 0), 0>;    // any_lane    def : Pat<(nxv16i1 (Op_lane (nxv16i8 ZPRAny:$Zn), (i32 timm32_0_0:$Idx))), @@ -10143,6 +10149,12 @@ multiclass sve2p1_pred_to_vector<string mnemonic, SDPatternOperator MergeOp,    def : InstAlias<mnemonic # "\t$Zd, $Pn",                   (!cast<Instruction>(NAME # _B) ZPRAny:$Zd, 0, PPR8:$Pn), 1>; +  def : InstAlias<mnemonic # "\t$Zd, $Pn", +                 (!cast<Instruction>(NAME # _H) ZPRAny:$Zd, 0, PPR16:$Pn), 0>; +  def : InstAlias<mnemonic # "\t$Zd, $Pn", +                 (!cast<Instruction>(NAME # _S) ZPRAny:$Zd, 0, PPR32:$Pn), 0>; +  def : InstAlias<mnemonic # "\t$Zd, $Pn", +                 (!cast<Instruction>(NAME # _D) ZPRAny:$Zd, 0, PPR64:$Pn), 0>;    // Merge    def : Pat<(nxv8i16 (MergeOp (nxv8i16 ZPRAny:$Zd), (nxv8i1 PPR16:$Pn), (i32 timm32_1_1:$Idx))), diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td index d2a325d5ad89..df8c35ffd457 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register",    "Has SHADER_CYCLES hardware register"  >; +def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers", +  "HasShaderCyclesHiLoRegisters", +  "true", +  "Has SHADER_CYCLES_HI/LO hardware registers" +>; +  def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",    "HasMadMacF32Insts",    "true", @@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet<     FeatureNSAEncoding,     FeaturePartialNSAEncoding,     FeatureWavefrontSize32, -   FeatureShaderCyclesRegister, +   FeatureShaderCyclesHiLoRegisters,     FeatureArchitectedFlatScratch,     FeatureAtomicFaddRtnInsts,     FeatureAtomicFaddNoRtnInsts, @@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,  def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,    AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; +def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">; +  def HasFP8Insts : 
Predicate<"Subtarget->hasFP8Insts()">,    AssemblerPredicate<(all_of FeatureFP8Insts)>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 9036b26a6f6b..c5207228dc91 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[    // 32 is reserved for the stack pointer    // 33 is reserved for the frame pointer    // 34 is reserved for the base pointer -  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ +  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[      SGPR4, SGPR5, SGPR6, SGPR7,      SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,      SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,      SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29    ]>>>, -  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ +  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,      VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31    ]>>>, -  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> +  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>  ]>;  def RetCC_SI_Gfx : CallingConv<[    CCIfType<[i1], CCPromoteToType<i32>>,    CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, -  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ +  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[  def CC_SI_SHADER : CallingConv<[ -  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ +  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[      SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,      SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,      SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[    ]>>>,    // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. -  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ +  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -109,7 +109,7 @@ def RetCC_SI_Shader : CallingConv<[    ]>>,    // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. 
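The calling-convention edits in this file all follow one pattern: bf16 and v2bf16 are added wherever f16 and v2f16 were already accepted, so bfloat values use the same SGPR/VGPR pools and 4-byte stack slots; the remaining tables below (RetCC_SI_Shader, CC_AMDGPU_Func, CC_AMDGPU_CS_CHAIN) get the same treatment. Illustratively, for a plain callable function, and assuming the front end accepts __bf16 for this target:

// A bf16 argument is now assigned like an f16 one under CC_AMDGPU_Func /
// RetCC_AMDGPU_Func: the first vector argument lands in VGPR0, and the scalar
// return value comes back in VGPR0 as well (illustration, not from the patch).
__bf16 passthrough(__bf16 x) { return x; }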
-  CCIfType<[f32, f16, v2f16] , CCAssignToReg<[ +  CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -188,23 +188,23 @@ def CC_AMDGPU_Func : CallingConv<[    CCIfType<[i1], CCPromoteToType<i32>>,    CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>, -  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< +  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<      !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29    >>>, -  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ +  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,      VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, -  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> +  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>  ]>;  // Calling convention for leaf functions  def RetCC_AMDGPU_Func : CallingConv<[    CCIfType<[i1], CCPromoteToType<i32>>,    CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, -  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[ +  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[      VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,      VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,      VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -223,11 +223,11 @@ def CC_AMDGPU : CallingConv<[  ]>;  def CC_AMDGPU_CS_CHAIN : CallingConv<[ -  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< +  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<      !foreach(i, !range(105), !cast<Register>("SGPR"#i))    >>>, -  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< +  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<      !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i))    >>>  ]>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 8d4cad4c07bc..0c77fe725958 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -104,6 +104,13 @@ def foldable_fneg : GICombineRule<           [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),    (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; +// Detects s_mul_u64 instructions whose higher bits are zero/sign extended. 
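The rule declared next (smulu64) and its matcher later in AMDGPUPostLegalizerCombiner.cpp rest on a simple arithmetic fact: if both 64-bit operands are really zero- or sign-extended 32-bit values, the full product can be produced by a 32x32->64 multiply. A plain C++ correctness sketch, not the in-tree code:

#include <cstdint>
// Signed case: 33 or more sign bits means the value fits in int32_t, and the
// exact 64-bit product of the truncated values equals the original product.
int64_t mul_s64_via_32(int64_t a, int64_t b) {
  return (int64_t)(int32_t)a * (int32_t)b;   // valid only under that assumption
}
// Unsigned case: 32 or more known leading zero bits means both fit in uint32_t.
uint64_t mul_u64_via_32(uint64_t a, uint64_t b) {
  return (uint64_t)(uint32_t)a * (uint32_t)b;
}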
+def smulu64 : GICombineRule< +  (defs root:$smul, unsigned_matchinfo:$matchinfo), +  (match (wip_match_opcode G_MUL):$smul, +         [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]), +  (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>; +  def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;  def sign_extension_in_reg : GICombineRule< @@ -149,7 +156,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner<    "AMDGPUPostLegalizerCombinerImpl",    [all_combines, gfx6gfx7_combines, gfx8_combines,     uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, -   rcp_sqrt_to_rsq, sign_extension_in_reg]> { +   rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> {    let CombineAllMethodName = "tryCombineAllImpl";  } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b51a876750b5..74e9cd7d0965 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(      Offset += 8; // Skipped.    } -  Offset += 72; // Reserved. +  // Emit argument for hidden dynamic lds size +  if (MFI.isDynamicLDSUsed()) { +    emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, +                  Args); +  } else { +    Offset += 4; // skipped +  } + +  Offset += 68; // Reserved.    // hidden_private_base and hidden_shared_base are only when the subtarget has    // ApertureRegs. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82ab8f4..719ae2e8750c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -303,6 +303,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {      switch (N->getOpcode()) {      case ISD::BUILD_VECTOR: +      // TODO: Match load d16 from shl (extload:i16), 16        MadeChange |= matchLoadD16FromBuildVector(N);        break;      default: @@ -317,26 +318,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {    }  } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, -                                           bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {    if (N->isUndef())      return true;    const SIInstrInfo *TII = Subtarget->getInstrInfo(); -  if (Negated) { -    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) -      return TII->isInlineConstant(-C->getAPIntValue()); +  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) +    return TII->isInlineConstant(C->getAPIntValue()); -    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) -      return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - -  } else { -    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) -      return TII->isInlineConstant(C->getAPIntValue()); - -    if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) -      return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); -  } +  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) +    return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());    return false;  } @@ -382,7 +373,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,          
Subtarget->getRegisterInfo()->getRegClass(RCID);      SDValue SubRegOp = N->getOperand(OpNo + 1); -    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); +    unsigned SubRegIdx = SubRegOp->getAsZExtVal();      return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,                                                                SubRegIdx);    } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 374108af08cd..df4a211d42a0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) {  }  // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, -                                        bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) {    assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);    uint32_t LHSVal, RHSVal;    if (getConstantValue(N->getOperand(0), LHSVal) &&        getConstantValue(N->getOperand(1), RHSVal)) {      SDLoc SL(N); -    uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16) -                        : (LHSVal & 0xffff) | (RHSVal << 16); +    uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16);      return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),                                DAG.getTargetConstant(K, SL, MVT::i32));    } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,    return nullptr;  } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { -  return packConstantV2I16(N, DAG, true); -}  } // namespace  /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ protected:  private:    std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; -  bool isInlineImmediate(const SDNode *N, bool Negated = false) const; -  bool isNegInlineImmediate(const SDNode *N) const { -    return isInlineImmediate(N, true); -  } +  bool isInlineImmediate(const SDNode *N) const;    bool isInlineImmediate16(int64_t Imm) const {      return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8fbc90a6db9f..0dbcaf5a1b13 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -387,17 +387,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,                        MVT::v9i32,  MVT::v9f32,  MVT::v10i32, MVT::v10f32,                        MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},                       Custom); + +  // FIXME: Why is v8f16/v8bf16 missing?    
setOperationAction(        ISD::EXTRACT_SUBVECTOR, -      {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,  MVT::v2f32, -       MVT::v2i32,  MVT::v3f32,  MVT::v3i32,  MVT::v4f32,  MVT::v4i32, -       MVT::v5f32,  MVT::v5i32,  MVT::v6f32,  MVT::v6i32,  MVT::v7f32, -       MVT::v7i32,  MVT::v8f32,  MVT::v8i32,  MVT::v9f32,  MVT::v9i32, -       MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, -       MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, -       MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,  MVT::v3f64, -       MVT::v3i64,  MVT::v4f64,  MVT::v4i64,  MVT::v8f64,  MVT::v8i64, -       MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16}, +      {MVT::v2f16,  MVT::v2bf16, MVT::v2i16,  MVT::v4f16,  MVT::v4bf16, +       MVT::v4i16,  MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32, +       MVT::v4f32,  MVT::v4i32,  MVT::v5f32,  MVT::v5i32,  MVT::v6f32, +       MVT::v6i32,  MVT::v7f32,  MVT::v7i32,  MVT::v8f32,  MVT::v8i32, +       MVT::v9f32,  MVT::v9i32,  MVT::v10i32, MVT::v10f32, MVT::v11i32, +       MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16, +       MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, +       MVT::v2f64,  MVT::v2i64,  MVT::v3f64,  MVT::v3i64,  MVT::v4f64, +       MVT::v4i64,  MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64, +       MVT::v32i16, MVT::v32f16, MVT::v32bf16},        Custom);    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -3281,7 +3284,15 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,      return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);    } -  assert(SrcVT == MVT::i64 && "operation should be legal"); +  if (DestVT == MVT::bf16) { +    SDLoc SL(Op); +    SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); +    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); +    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); +  } + +  if (SrcVT != MVT::i64) +    return Op;    if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {      SDLoc DL(Op); @@ -3319,7 +3330,15 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,      return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);    } -  assert(SrcVT == MVT::i64 && "operation should be legal"); +  if (DestVT == MVT::bf16) { +    SDLoc SL(Op); +    SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src); +    SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); +    return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); +  } + +  if (SrcVT != MVT::i64) +    return Op;    // TODO: Factor out code common with LowerUINT_TO_FP. 
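LowerUINT_TO_FP and LowerSINT_TO_FP gain the same three-line bf16 path in these hunks: there is no direct integer-to-bf16 conversion, so the value is converted to f32 and then rounded. Condensed into one shared helper it would look roughly like this (the helper name is illustrative and not part of the patch):

static SDValue lowerIntToBF16(SelectionDAG &DAG, const SDLoc &SL, SDValue Src,
                              bool Signed) {
  SDValue ToF32 = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, SL,
                              MVT::f32, Src);
  SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
  return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
}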
@@ -3517,7 +3536,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con    return DAG.getZExtOrTrunc(V, DL, Op.getValueType());  } -SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, +SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,                                               SelectionDAG &DAG) const {    SDValue Src = Op.getOperand(0);    unsigned OpOpcode = Op.getOpcode(); @@ -3528,6 +3547,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,    if (SrcVT == MVT::f16 && DestVT == MVT::i16)      return Op; +  if (SrcVT == MVT::bf16) { +    SDLoc DL(Op); +    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); +    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); +  } +    // Promote i16 to i32    if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {      SDLoc DL(Op); @@ -3536,6 +3561,9 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);    } +  if (DestVT != MVT::i64) +    return Op; +    if (SrcVT == MVT::f16 ||        (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {      SDLoc DL(Op); @@ -3546,7 +3574,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,      return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);    } -  if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) +  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)      return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);    return SDValue(); @@ -4947,7 +4975,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,      //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))      if (DestVT.isVector()) {        SDValue Src = N->getOperand(0); -      if (Src.getOpcode() == ISD::BUILD_VECTOR) { +      if (Src.getOpcode() == ISD::BUILD_VECTOR && +          (DCI.getDAGCombineLevel() < AfterLegalizeDAG || +           isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {          EVT SrcVT = Src.getValueType();          unsigned NElts = DestVT.getVectorNumElements(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 88ef4b577424..ad8dcda93c36 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2764,7 +2764,9 @@ static bool isConstant(const MachineInstr &MI) {  void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,      const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { -  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); +  unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 
0 : 1; +  const MachineInstr *PtrMI = +      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());    assert(PtrMI); @@ -2817,6 +2819,10 @@ bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {    if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)      return true; +  if (MI.getOpcode() == AMDGPU::G_PREFETCH) +    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == +           AMDGPU::SGPRRegBankID; +    const Instruction *I = dyn_cast<Instruction>(Ptr);    return I && I->getMetadata("amdgpu.uniform");  } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee28889451..aa235c07e995 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -701,13 +701,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,            .maxScalar(0, S32);      } -    getActionDefinitionsBuilder(G_MUL) -      .legalFor({S32, S16, V2S16}) -      .clampMaxNumElementsStrict(0, S16, 2) -      .scalarize(0) -      .minScalar(0, S16) -      .widenScalarToNextMultipleOf(0, 32) -      .custom(); +    if (ST.hasScalarSMulU64()) { +      getActionDefinitionsBuilder(G_MUL) +          .legalFor({S64, S32, S16, V2S16}) +          .clampMaxNumElementsStrict(0, S16, 2) +          .scalarize(0) +          .minScalar(0, S16) +          .widenScalarToNextMultipleOf(0, 32) +          .custom(); +    } else { +      getActionDefinitionsBuilder(G_MUL) +          .legalFor({S32, S16, V2S16}) +          .clampMaxNumElementsStrict(0, S16, 2) +          .scalarize(0) +          .minScalar(0, S16) +          .widenScalarToNextMultipleOf(0, 32) +          .custom(); +    }      assert(ST.hasMad64_32());      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) @@ -1996,8 +2006,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,    verify(*ST.getInstrInfo());  } -bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                         MachineInstr &MI) const { +bool AMDGPULegalizerInfo::legalizeCustom( +    LegalizerHelper &Helper, MachineInstr &MI, +    LostDebugLocObserver &LocObserver) const {    MachineIRBuilder &B = Helper.MIRBuilder;    MachineRegisterInfo &MRI = *B.getMRI(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 855fa0ddc214..56aabd4f6ab7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -27,7 +27,6 @@ class MachineIRBuilder;  namespace AMDGPU {  struct ImageDimIntrinsicInfo;  } -/// This class provides the information for the target register banks.  
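On the G_MUL change above: when the subtarget reports hasScalarSMulU64(), a 64-bit G_MUL is now directly legal, so a uniform multiply can stay as a single scalar instruction; the two builder branches differ only in that extra S64 entry, and everything else keeps the existing 32-bit custom expansion. The kind of source this targets (illustrative; actual selection depends on uniformity and on the register-bank handling later in AMDGPURegisterBankInfo.cpp):

#include <cstdint>
// A uniform 64-bit multiply; with S_MUL_U64 available it no longer has to be
// split into mul_lo/mul_hi/add pieces.
uint64_t area(uint64_t width, uint64_t height) { return width * height; }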
class AMDGPULegalizerInfo final : public LegalizerInfo {    const GCNSubtarget &ST; @@ -35,7 +34,8 @@ public:    AMDGPULegalizerInfo(const GCNSubtarget &ST,                        const GCNTargetMachine &TM); -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;    Register getSegmentAperture(unsigned AddrSpace,                                MachineRegisterInfo &MRI, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index f03e6b8915b1..1b2f74cf153b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -87,9 +87,6 @@ private:                                Constant *copr0, Constant *copr1);    bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); -  // sqrt -  bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); -    /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value    /// of cos, sincos call).    std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg, @@ -672,8 +669,6 @@ bool AMDGPULibCalls::fold(CallInst *CI) {      // Specialized optimizations for each function call.      // -    // TODO: Handle other simple intrinsic wrappers. Sqrt. -    //      // TODO: Handle native functions      switch (FInfo.getId()) {      case AMDGPULibFunc::EI_EXP: @@ -794,7 +789,9 @@ bool AMDGPULibCalls::fold(CallInst *CI) {      case AMDGPULibFunc::EI_ROOTN:        return fold_rootn(FPOp, B, FInfo);      case AMDGPULibFunc::EI_SQRT: -      return fold_sqrt(FPOp, B, FInfo); +      // TODO: Allow with strictfp + constrained intrinsic +      return tryReplaceLibcallWithSimpleIntrinsic( +          B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false);      case AMDGPULibFunc::EI_COS:      case AMDGPULibFunc::EI_SIN:        return fold_sincos(FPOp, B, FInfo); @@ -1273,29 +1270,6 @@ bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(    return true;  } -// fold sqrt -> native_sqrt (x) -bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, -                               const FuncInfo &FInfo) { -  if (!isUnsafeMath(FPOp)) -    return false; - -  if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) && -      (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) { -    Module *M = B.GetInsertBlock()->getModule(); - -    if (FunctionCallee FPExpr = getNativeFunction( -            M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { -      Value *opr0 = FPOp->getOperand(0); -      LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " -                        << "sqrt(" << *opr0 << ")\n"); -      Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); -      replaceCall(FPOp, nval); -      return true; -    } -  } -  return false; -} -  std::tuple<Value *, Value *, Value *>  AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,                               FunctionCallee Fsincos) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 323462e60a29..31777295b4f8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,6 +19,26 @@  using 
namespace llvm; +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { +  const Module *M = F.getParent(); +  SmallString<64> KernelDynLDSName("llvm.amdgcn."); +  KernelDynLDSName += F.getName(); +  KernelDynLDSName += ".dynlds"; +  return M->getNamedGlobal(KernelDynLDSName); +} + +static bool hasLDSKernelArgument(const Function &F) { +  for (const Argument &Arg : F.args()) { +    Type *ArgTy = Arg.getType(); +    if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) { +      if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) +        return true; +    } +  } +  return false; +} +  AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,                                               const AMDGPUSubtarget &ST)      : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), @@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,    Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math");    NoSignedZerosFPMath =        NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; + +  const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); +  if (DynLdsGlobal || hasLDSKernelArgument(F)) +    UsesDynamicLDS = true;  }  unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, @@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,    return Offset;  } -static const GlobalVariable * -getKernelDynLDSGlobalFromFunction(const Function &F) { -  const Module *M = F.getParent(); -  std::string KernelDynLDSName = "llvm.amdgcn."; -  KernelDynLDSName += F.getName(); -  KernelDynLDSName += ".dynlds"; -  return M->getNamedGlobal(KernelDynLDSName); -} -  std::optional<uint32_t>  AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {    // TODO: Would be more consistent with the abs symbols to use a range @@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,      }    }  } + +void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) { +  UsesDynamicLDS = DynLDS; +} + +bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 248ee26a47eb..7efb7f825348 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -46,6 +46,9 @@ protected:    /// stages.    Align DynLDSAlign; +  // Flag to check dynamic LDS usage by kernel. +  bool UsesDynamicLDS = false; +    // Kernels + shaders. i.e. functions called by the hardware and not called    // by other functions.    
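The UsesDynamicLDS bookkeeping above pairs with the hidden_dynamic_lds_size kernel argument emitted in AMDGPUHSAMetadataStreamer.cpp earlier in this diff; the flag is set in the AMDGPUMachineFunction constructor when either trigger is present. Condensed into one predicate (a sketch of the two new checks, not a new API):

// A kernel "uses dynamic LDS" if the module carries a matching
// llvm.amdgcn.<kernel>.dynlds global, or any argument points into LDS.
static bool usesDynamicLDS(const Function &F) {
  const Module *M = F.getParent();
  std::string DynLDSName = ("llvm.amdgcn." + F.getName() + ".dynlds").str();
  if (M->getNamedGlobal(DynLDSName))
    return true;
  for (const Argument &Arg : F.args())
    if (auto *PtrTy = dyn_cast<PointerType>(Arg.getType()))
      if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
        return true;
  return false;
}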
bool IsEntryFunction = false; @@ -119,6 +122,10 @@ public:    Align getDynLDSAlign() const { return DynLDSAlign; }    void setDynLDSAlign(const Function &F, const GlobalVariable &GV); + +  void setUsesDynamicLDS(bool DynLDS); + +  bool isDynamicLDSUsed() const;  };  } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 7b18e1f805d8..21bfab52c6c4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -104,6 +104,14 @@ public:    void applyCombineSignExtendInReg(MachineInstr &MI,                                     MachineInstr *&MatchInfo) const; +  // Find the s_mul_u64 instructions where the higher bits are either +  // zero-extended or sign-extended. +  bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; +  // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher +  // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32 +  // bits are zero extended. +  void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; +  private:  #define GET_GICOMBINER_CLASS_MEMBERS  #define AMDGPUSubtarget GCNSubtarget @@ -419,6 +427,32 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(    MI.eraseFromParent();  } +bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64( +    MachineInstr &MI, unsigned &NewOpcode) const { +  Register Src0 = MI.getOperand(1).getReg(); +  Register Src1 = MI.getOperand(2).getReg(); +  if (MRI.getType(Src0) != LLT::scalar(64)) +    return false; + +  if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 && +      KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) { +    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32; +    return true; +  } + +  if (KB->computeNumSignBits(Src1) >= 33 && +      KB->computeNumSignBits(Src0) >= 33) { +    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32; +    return true; +  } +  return false; +} + +void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64( +    MachineInstr &MI, unsigned &NewOpcode) const { +  Helper.replaceOpcodeWith(MI, NewOpcode); +} +  // Pass boilerplate  // ================ diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index fba060464a6e..391c2b9ec256 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -441,7 +441,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(  // FIXME: Returns uniform if there's no source value information. This is  // probably wrong. -static bool isScalarLoadLegal(const MachineInstr &MI) { +bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {    if (!MI.hasOneMemOperand())      return false; @@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(    return true;  } +// Break s_mul_u64 into 32-bit vector operations. +void AMDGPURegisterBankInfo::applyMappingSMULU64( +    MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { +  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); +  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); +  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); + +  // All inputs are SGPRs, nothing special to do. 
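applyMappingSMULU64, whose body continues below, rebuilds a 64-bit product from 32-bit VALU pieces; the long-multiplication comment inside it corresponds to this plain C++ identity (a reference model, not the in-tree code):

#include <cstdint>
uint64_t mul64_from_32(uint64_t a, uint64_t b) {
  uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
  uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
  uint64_t lo_full = (uint64_t)a_lo * b_lo;           // Op1L*Op0L, full 64 bits
  uint32_t hi = (uint32_t)(lo_full >> 32)             // carry, i.e. umulh(lo, lo)
              + a_hi * b_lo + a_lo * b_hi;            // cross terms, modulo 2^32
  return ((uint64_t)hi << 32) | (uint32_t)lo_full;    // a_hi*b_hi only affects
                                                      // bit 64 and above: dropped
}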
+  if (DefRegs.empty()) { +    assert(Src0Regs.empty() && Src1Regs.empty()); +    applyDefaultMapping(OpdMapper); +    return; +  } + +  assert(DefRegs.size() == 2); +  assert(Src0Regs.size() == Src1Regs.size() && +         (Src0Regs.empty() || Src0Regs.size() == 2)); + +  MachineRegisterInfo &MRI = OpdMapper.getMRI(); +  MachineInstr &MI = OpdMapper.getMI(); +  Register DstReg = MI.getOperand(0).getReg(); +  LLT HalfTy = LLT::scalar(32); + +  // Depending on where the source registers came from, the generic code may +  // have decided to split the inputs already or not. If not, we still need to +  // extract the values. + +  if (Src0Regs.empty()) +    split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); +  else +    setRegsToType(MRI, Src0Regs, HalfTy); + +  if (Src1Regs.empty()) +    split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); +  else +    setRegsToType(MRI, Src1Regs, HalfTy); + +  setRegsToType(MRI, DefRegs, HalfTy); + +  // The multiplication is done as follows: +  // +  //                            Op1H  Op1L +  //                          * Op0H  Op0L +  //                       -------------------- +  //                       Op1H*Op0L  Op1L*Op0L +  //          + Op1H*Op0H  Op1L*Op0H +  // ----------------------------------------- +  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L +  // +  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit +  //  value and that would overflow. +  //  The low 32-bit value is Op1L*Op0L. +  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from +  //  Op1L*Op0L). + +  ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); + +  Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); +  Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); +  Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); +  Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); +  B.buildAdd(DefRegs[1], Add, MulHiLo); +  B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); + +  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); +  MI.eraseFromParent(); +} +  void AMDGPURegisterBankInfo::applyMappingImpl(      MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {    MachineInstr &MI = OpdMapper.getMI(); @@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(      Register DstReg = MI.getOperand(0).getReg();      LLT DstTy = MRI.getType(DstReg); +    // Special case for s_mul_u64. There is not a vector equivalent of +    // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector +    // multiplications. +    if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { +      applyMappingSMULU64(B, OpdMapper); +      return; +    } +      // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.      // Packed 16-bit operations need to be scalarized and promoted.      if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))        break;      const RegisterBank *DstBank = -      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; +        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;      if (DstBank == &AMDGPU::VGPRRegBank)        break; @@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl(      return;    } +  case AMDGPU::G_AMDGPU_S_MUL_I64_I32: +  case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { +    // This is a special case for s_mul_u64. 
We use +    // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation +    // where the 33 higher bits are sign-extended and +    // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation +    // where the 32 higher bits are zero-extended. In case scalar registers are +    // selected, both opcodes are lowered as s_mul_u64. If the vector registers +    // are selected, then G_AMDGPU_S_MUL_I64_I32 and +    // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. + +    // Insert basic copies. +    applyDefaultMapping(OpdMapper); + +    Register DstReg = MI.getOperand(0).getReg(); +    Register SrcReg0 = MI.getOperand(1).getReg(); +    Register SrcReg1 = MI.getOperand(2).getReg(); +    const LLT S32 = LLT::scalar(32); +    const LLT S64 = LLT::scalar(64); +    assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " +                                         "that handles only 64-bit operands."); +    const RegisterBank *DstBank = +        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + +    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 +    // with s_mul_u64 operation. +    if (DstBank == &AMDGPU::SGPRRegBank) { +      MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); +      MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); +      MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); +      MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); +      return; +    } + +    // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 +    // with a vector mad. +    assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && +           "The destination operand should be in vector registers."); + +    DebugLoc DL = MI.getDebugLoc(); + +    // Extract the lower subregister from the first operand. +    Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +    MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); +    MRI.setType(Op0L, S32); +    B.buildTrunc(Op0L, SrcReg0); + +    // Extract the lower subregister from the second operand. +    Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +    MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); +    MRI.setType(Op1L, S32); +    B.buildTrunc(Op1L, SrcReg1); + +    unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 +                          ? AMDGPU::G_AMDGPU_MAD_U64_U32 +                          : AMDGPU::G_AMDGPU_MAD_I64_I32; + +    MachineIRBuilder B(MI); +    Register Zero64 = B.buildConstant(S64, 0).getReg(0); +    MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); +    Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); +    MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); +    B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); +    MI.eraseFromParent(); +    return; +  }    case AMDGPU::G_SEXT_INREG: {      SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));      if (SrcRegs.empty()) @@ -3263,17 +3405,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(        MI.eraseFromParent();        return;      } -    unsigned PtrBank = -        getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID); +    Register PtrReg = MI.getOperand(0).getReg(); +    unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);      if (PtrBank == AMDGPU::VGPRRegBankID) {        MI.eraseFromParent();        return;      } -    // FIXME: There is currently no support for prefetch in global isel. 
-    // There is no node equivalence and what's worse there is no MMO produced -    // for a prefetch on global isel path. -    // Prefetch does not affect execution so erase it for now. -    MI.eraseFromParent(); +    unsigned AS = MRI.getType(PtrReg).getAddressSpace(); +    if (!AMDGPU::isFlatGlobalAddrSpace(AS) && +        AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { +      MI.eraseFromParent(); +      return; +    } +    applyDefaultMapping(OpdMapper);      return;    }    default: @@ -3667,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {    case AMDGPU::G_AND:    case AMDGPU::G_OR: -  case AMDGPU::G_XOR: { +  case AMDGPU::G_XOR: +  case AMDGPU::G_MUL: {      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();      if (Size == 1) {        const RegisterBank *DstBank @@ -3735,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {    case AMDGPU::G_PTRMASK:    case AMDGPU::G_ADD:    case AMDGPU::G_SUB: -  case AMDGPU::G_MUL:    case AMDGPU::G_SHL:    case AMDGPU::G_LSHR:    case AMDGPU::G_ASHR: @@ -3753,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {    case AMDGPU::G_SHUFFLE_VECTOR:    case AMDGPU::G_SBFX:    case AMDGPU::G_UBFX: +  case AMDGPU::G_AMDGPU_S_MUL_I64_I32: +  case AMDGPU::G_AMDGPU_S_MUL_U64_U32:      if (isSALUMapping(MI))        return getDefaultMappingSOP(MI);      return getDefaultMappingVOP(MI); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index b5d16e70ab23..5f550b426ec0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -84,6 +84,9 @@ public:    bool applyMappingMAD_64_32(MachineIRBuilder &B,                               const OperandsMapper &OpdMapper) const; +  void applyMappingSMULU64(MachineIRBuilder &B, +                           const OperandsMapper &OpdMapper) const; +    Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,                            Register Reg) const; @@ -173,6 +176,8 @@ public:    const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,                                               LLT) const override; +  bool isScalarLoadLegal(const MachineInstr &MI) const; +    InstructionMappings    getInstrAlternativeMappings(const MachineInstr &MI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index fdc2077868cf..0f3bb3e7b0d8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -620,7 +620,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {    AAM.registerFunctionAnalysis<AMDGPUAA>();  } -void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) {    PB.registerPipelineParsingCallback(        [this](StringRef PassName, ModulePassManager &PM,               ArrayRef<PassBuilder::PipelineElement>) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 9051a61e6557..99c9db3e654a 100644 --- 
a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -51,7 +51,8 @@ public:      return TLOF.get();    } -  void registerPassBuilderCallbacks(PassBuilder &PB) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override;    void registerDefaultAliasAnalyses(AAManager &) override;    /// Get the integer value of a null pointer in the given address space. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index abd7e911beef..b7f043860115 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -166,6 +166,8 @@ public:      ImmTyEndpgm,      ImmTyWaitVDST,      ImmTyWaitEXP, +    ImmTyWaitVAVDst, +    ImmTyWaitVMVSrc,    };    // Immediate operand kind. @@ -909,6 +911,8 @@ public:    bool isEndpgm() const;    bool isWaitVDST() const;    bool isWaitEXP() const; +  bool isWaitVAVDst() const; +  bool isWaitVMVSrc() const;    auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {      return std::bind(P, *this); @@ -1029,6 +1033,7 @@ public:    }    static void printImmTy(raw_ostream& OS, ImmTy Type) { +    // clang-format off      switch (Type) {      case ImmTyNone: OS << "None"; break;      case ImmTyGDS: OS << "GDS"; break; @@ -1086,7 +1091,10 @@ public:      case ImmTyEndpgm: OS << "Endpgm"; break;      case ImmTyWaitVDST: OS << "WaitVDST"; break;      case ImmTyWaitEXP: OS << "WaitEXP"; break; +    case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break; +    case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;      } +    // clang-format on    }    void print(raw_ostream &OS) const override { @@ -1857,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {    case AMDGPU::OPERAND_REG_IMM_V2FP32:    case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:    case AMDGPU::OPERAND_REG_IMM_V2INT32: +  case AMDGPU::OPERAND_REG_IMM_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:    case AMDGPU::OPERAND_KIMM32:    case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:      return &APFloat::IEEEsingle(); @@ -1871,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {    case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:    case AMDGPU::OPERAND_REG_INLINE_C_INT16:    case AMDGPU::OPERAND_REG_INLINE_C_FP16: -  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:    case AMDGPU::OPERAND_REG_INLINE_AC_INT16:    case AMDGPU::OPERAND_REG_INLINE_AC_FP16: -  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:    case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: -  case AMDGPU::OPERAND_REG_IMM_V2INT16:    case AMDGPU::OPERAND_REG_IMM_V2FP16:    case AMDGPU::OPERAND_KIMM16:      return &APFloat::IEEEhalf(); @@ -2025,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {    // We allow fp literals with f16x2 operands assuming that the specified    // literal goes into the lower half and the upper half is zero. We also    // require that the literal may be losslessly converted to f16. -  MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : -                     (type == MVT::v2i16)? MVT::i16 : -                     (type == MVT::v2f32)? 
MVT::f32 : type; +  // +  // For i16x2 operands, we assume that the specified literal is encoded as a +  // single-precision float. This is pretty odd, but it matches SP3 and what +  // happens in hardware. +  MVT ExpectedType = (type == MVT::v2f16)   ? MVT::f16 +                     : (type == MVT::v2i16) ? MVT::f32 +                     : (type == MVT::v2f32) ? MVT::f32 +                                            : type;    APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val));    return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3393,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,      if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||          OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||          OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) -      return AMDGPU::isInlinableIntLiteralV216(Val); +      return AMDGPU::isInlinableLiteralV2I16(Val);      if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||          OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||          OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) -      return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); +      return AMDGPU::isInlinableLiteralV2F16(Val);      return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());    } @@ -9192,6 +9205,14 @@ bool AMDGPUOperand::isWaitVDST() const {    return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());  } +bool AMDGPUOperand::isWaitVAVDst() const { +  return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm()); +} + +bool AMDGPUOperand::isWaitVMVSrc() const { +  return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm()); +} +  //===----------------------------------------------------------------------===//  // VINTERP  //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td index 43d35fa5291c..9e99d382ed9b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -503,7 +503,6 @@ class MUBUF_Load_Pseudo <string opName,    let has_vdata = !not(!or(isLds, isLdsOpc));    let mayLoad = 1;    let mayStore = isLds; -  let maybeAtomic = 1;    let Uses = !if(!or(isLds, isLdsOpc) , [EXEC, M0], [EXEC]);    let tfe = isTFE;    let lds = isLds; @@ -610,7 +609,6 @@ class MUBUF_Store_Pseudo <string opName,                      getAddrName<addrKindCopy>.ret;    let mayLoad = 0;    let mayStore = 1; -  let maybeAtomic = 1;    let elements = getMUBUFElements<store_vt>.ret;    let tfe = isTFE;  } @@ -671,7 +669,6 @@ class MUBUF_Pseudo_Store_Lds<string opName>    let LGKM_CNT = 1;    let mayLoad = 1;    let mayStore = 1; -  let maybeAtomic = 1;    let has_vdata = 0;    let has_vaddr = 0; @@ -735,7 +732,6 @@ class MUBUF_Atomic_Pseudo<string opName,    let has_glc = 0;    let has_dlc = 0;    let has_sccb = 1; -  let maybeAtomic = 1;    let AsmMatchConverter = "cvtMubufAtomic";  } @@ -1222,8 +1218,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <  } // End HasD16LoadStore -def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", -                                       int_amdgcn_buffer_wbinvl1>; +let SubtargetPredicate = isNotGFX12Plus in +def BUFFER_WBINVL1 : MUBUF_Invalidate < +  "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1 +>;  let SubtargetPredicate = HasAtomicFaddNoRtnInsts in  defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< diff 
--git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSDIRInstructions.td new file mode 100644 index 000000000000..4416da605981 --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSDIRInstructions.td @@ -0,0 +1,192 @@ +//===-- DSDIRInstructions.td - LDS/VDS Direct Instruction Definitions -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR encoding (LDSDIR is gfx11, VDSDIR is gfx12+) +//===----------------------------------------------------------------------===// + +class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { +  // encoding fields +  bits<2> attrchan; +  bits<6> attr; +  bits<4> waitvdst; +  bits<8> vdst; + +  // encoding +  let Inst{31-24} = 0xce; // encoding +  let Inst{23-22} = 0x0; // reserved +  let Inst{21-20} = op; +  let Inst{19-16} = waitvdst; +  let Inst{15-10} = !if(is_direct, ?, attr); +  let Inst{9-8} = !if(is_direct, ?, attrchan); +  let Inst{7-0} = vdst; +} + +class VDSDIRe<bits<2> op, bit is_direct> : Enc32 { +  // encoding fields +  bits<2> attrchan; +  bits<6> attr; +  bits<4> waitvdst; +  bits<8> vdst; +  bits<1> waitvsrc; + +  // encoding +  let Inst{31-24} = 0xce; // encoding +  let Inst{23} = waitvsrc; +  let Inst{22} = 0x0; // reserved +  let Inst{21-20} = op; +  let Inst{19-16} = waitvdst; +  let Inst{15-10} = !if(is_direct, ?, attr); +  let Inst{9-8} = !if(is_direct, ?, attrchan); +  let Inst{7-0} = vdst; +} + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR Classes +//===----------------------------------------------------------------------===// + +class LDSDIR_getIns<bit direct> { +  dag ret = !if(direct, +    (ins wait_vdst:$waitvdst), +    (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) +  ); +} + +class VDSDIR_getIns<bit direct> { +  dag ret = !if(direct, +    (ins wait_va_vdst:$waitvdst, wait_va_vsrc:$waitvsrc), +    (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_va_vdst:$waitvdst, +         wait_va_vsrc:$waitvsrc) +  ); +} + +class DSDIR_Common<string opName, string asm = "", dag ins, bit direct> : +  InstSI<(outs VGPR_32:$vdst), ins, asm> { +  let LDSDIR = 1; +  let EXP_CNT = 1; + +  let hasSideEffects = 0; +  let mayLoad = 1; +  let mayStore = 0; +  let maybeAtomic = 0; + +  string Mnemonic = opName; +  let UseNamedOperandTable = 1; + +  let Uses = [M0, EXEC]; +  let DisableWQM = 0; +  let SchedRW = [WriteLDS]; + +  bit is_direct; +  let is_direct = direct; +} + +class DSDIR_Pseudo<string opName, dag ins, bit direct> : +  DSDIR_Common<opName, "", ins, direct>, +  SIMCInstr<opName, SIEncodingFamily.NONE> { +  let isPseudo = 1; +  let isCodeGenOnly = 1; +} + +class LDSDIR_getAsm<bit direct> { +  string ret = !if(direct, +    " $vdst$waitvdst", +    " $vdst, $attr$attrchan$waitvdst" +  ); +} + +class VDSDIR_getAsm<bit direct> { +  string ret = !if(direct, +    " $vdst$waitvdst$waitvsrc", +    " $vdst, $attr$attrchan$waitvdst$waitvsrc" +  ); +} + +class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> : +  DSDIR_Common<lds.Mnemonic, +               lds.Mnemonic # asm, +               ins, +               lds.is_direct>, +  SIMCInstr 
<lds.Mnemonic, subtarget> { +  let isPseudo = 0; +  let isCodeGenOnly = 0; +} + +//===----------------------------------------------------------------------===// +// LDS/VDS Direct Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Only in { + +def LDS_DIRECT_LOAD : DSDIR_Pseudo<"lds_direct_load", LDSDIR_getIns<1>.ret, 1>; +def LDS_PARAM_LOAD : DSDIR_Pseudo<"lds_param_load", LDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < +  (f32 (int_amdgcn_lds_direct_load M0)), +  (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < +  (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), +  (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + +} // End SubtargetPredicate = isGFX11Only + +let SubtargetPredicate = isGFX12Plus in { + +def DS_DIRECT_LOAD : DSDIR_Pseudo<"ds_direct_load", VDSDIR_getIns<1>.ret, 1>; +def DS_PARAM_LOAD : DSDIR_Pseudo<"ds_param_load", VDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < +  (f32 (int_amdgcn_lds_direct_load M0)), +  (DS_DIRECT_LOAD 0, 1) +>; + +def : GCNPat < +  (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), +  (DS_PARAM_LOAD timm:$attr, timm:$attrchan, 0, 1) +>; + +} // End SubtargetPredicate = isGFX12Only + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx11<bits<2> op, +                            DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> { +  def _gfx11 : DSDIR_Real<lds, lds.InOperandList, +                          LDSDIR_getAsm<lds.is_direct>.ret, +                          SIEncodingFamily.GFX11>, +               LDSDIRe<op, lds.is_direct> { +    let AssemblerPredicate = isGFX11Only; +    let DecoderNamespace = "GFX11"; +  } +} + +defm LDS_PARAM_LOAD : DSDIR_Real_gfx11<0x0>; +defm LDS_DIRECT_LOAD : DSDIR_Real_gfx11<0x1>; + +//===----------------------------------------------------------------------===// +// GFX12+ +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx12<bits<2> op, +                            DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> { +  def _gfx12 : DSDIR_Real<lds, lds.InOperandList, +                          VDSDIR_getAsm<lds.is_direct>.ret, +                          SIEncodingFamily.GFX12>, +               VDSDIRe<op, lds.is_direct> { +    let AssemblerPredicate = isGFX12Plus; +    let DecoderNamespace = "GFX12"; +  } +} + +defm DS_PARAM_LOAD : DSDIR_Real_gfx12<0x0>; +defm DS_DIRECT_LOAD : DSDIR_Real_gfx12<0x1>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td index bc9049b4ef33..3cccd8c50e66 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -19,7 +19,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt    // Most instruction load and store data, so set this as the default.    
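The new DSDIRInstructions.td above defines the raw encodings as TableGen Enc32 field assignments; read as ordinary bit arithmetic, the gfx12 VDSDIR layout is the following (a reading aid only, with field names as in VDSDIRe; for the direct-load form the attr and attrchan fields are don't-care):

#include <cstdint>
// Bits: 31-24 = 0xCE, 23 = waitvsrc, 22 = reserved (0), 21-20 = op,
// 19-16 = waitvdst, 15-10 = attr, 9-8 = attrchan, 7-0 = vdst.
uint32_t encodeVDSDIR(unsigned op, unsigned waitvsrc, unsigned waitvdst,
                      unsigned attr, unsigned attrchan, unsigned vdst) {
  return (0xCEu << 24) | ((waitvsrc & 1u) << 23) | ((op & 3u) << 20) |
         ((waitvdst & 0xFu) << 16) | ((attr & 0x3Fu) << 10) |
         ((attrchan & 3u) << 8) | (vdst & 0xFFu);
}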
let mayLoad = 1;    let mayStore = 1; -  let maybeAtomic = 1;    let hasSideEffects = 0;    let SchedRW = [WriteLDS]; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0fd642..9dff3f6c2efd 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,    DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm,      \                 false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth)         \ +  DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) +  // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)  // and decode using 'enum10' from decodeSrcOp.  #define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth)          \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)  DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)  DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) +  DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)  DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)  DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td index ff1d661ef6fe..4cfee7d013ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -20,6 +20,7 @@ class EXPCommon<bit row, bit done, string asm = ""> : InstSI<    let EXP_CNT = 1;    let mayLoad = done;    let mayStore = 1; +  let maybeAtomic = 0;    let UseNamedOperandTable = 1;    let Uses = !if(row, [EXEC, M0], [EXEC]);    let SchedRW = [WriteExport]; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td index 615f8cd54d8f..16a8b770e057 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins,    bits<1> has_sve  = 0; // Scratch VGPR Enable    bits<1> lds = 0;    bits<1> sve = 0; +  bits<1> has_offset = 1;    let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts,      !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :    let Inst{51-50} = cpol{4-3}; // scope    let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);    let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?); -  let Inst{95-72} = offset; +  let Inst{95-72} = !if(ps.has_offset, offset, ?);  }  class GlobalSaddrTable <bit is_saddr, string Name = ""> { @@ -214,7 +215,6 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass,    let has_saddr = HasSaddr;    let enabled_saddr = EnableSaddr;    let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); -  let maybeAtomic = 1;    let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");    let DisableEncoding = !if(HasTiedOutput, 
"$vdst_in", ""); @@ -236,7 +236,6 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,    let has_saddr = HasSaddr;    let enabled_saddr = EnableSaddr;    let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); -  let maybeAtomic = 1;  }  multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -262,7 +261,6 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,    let has_vaddr = 0;    let has_saddr = 1;    let enabled_saddr = EnableSaddr; -  let maybeAtomic = 1;    let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");    let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); @@ -329,7 +327,6 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,    let has_vaddr = 0;    let has_saddr = 1;    let enabled_saddr = EnableSaddr; -  let maybeAtomic = 1;    let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");  } @@ -340,6 +337,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass      GlobalSaddrTable<1, opName>;  } +class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> : +  FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> { + +  let AsmMatchConverter = ""; + +  let hasSideEffects = 1; +  let mayLoad = 0; +  let mayStore = 0; +  let is_flat_global = 1; + +  let has_offset = 0; +  let has_saddr = 0; +  let enabled_saddr = 0; +  let saddr_value = 0; +  let has_vdst = 0; +  let has_data = 0; +  let has_vaddr = 0; +  let has_glc = 0; +  let has_dlc = 0; +  let glcValue = 0; +  let dlcValue = 0; +  let has_sccb = 0; +  let sccbValue = 0; +  let has_sve = 0; +  let lds = 0; +  let sve = 0; +} +  class FlatScratchInst <string sv_op, string mode> {    string SVOp = sv_op;    string Mode = mode; @@ -372,7 +397,6 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,    let has_sve = EnableSVE;    let sve = EnableVaddr;    let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"))); -  let maybeAtomic = 1;    let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");    let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); @@ -401,7 +425,6 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En    let has_sve = EnableSVE;    let sve = EnableVaddr;    let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"))); -  let maybeAtomic = 1;  }  multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> { @@ -491,7 +514,6 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins,      let has_vdst = 0;      let has_sccb  = 1;      let sccbValue = 0; -    let maybeAtomic = 1;      let IsAtomicNoRet = 1;  } @@ -928,6 +950,10 @@ defm GLOBAL_LOAD_LDS_DWORD  : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor  let SubtargetPredicate = isGFX12Plus in {    defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; + +  def GLOBAL_INV    : FLAT_Global_Invalidate_Writeback<"global_inv">; +  def GLOBAL_WB     : FLAT_Global_Invalidate_Writeback<"global_wb">; +  def GLOBAL_WBINV  : FLAT_Global_Invalidate_Writeback<"global_wbinv">;  } // End SubtargetPredicate = isGFX12Plus  } // End is_flat_global = 1 @@ -2662,6 +2688,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32     : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A  defm GLOBAL_ATOMIC_ADD_F32         : 
VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;  defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">; +defm GLOBAL_INV                    : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">; +defm GLOBAL_WB                     : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">; +defm GLOBAL_WBINV                  : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">; +  // ENC_VSCRATCH.  defm SCRATCH_LOAD_U8               : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;  defm SCRATCH_LOAD_I8               : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a709303269..f6f37f5170a4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -176,6 +176,7 @@ protected:    bool HasGetWaveIdInst = false;    bool HasSMemTimeInst = false;    bool HasShaderCyclesRegister = false; +  bool HasShaderCyclesHiLoRegisters = false;    bool HasVOP3Literal = false;    bool HasNoDataDepHazard = false;    bool FlatAddressSpace = false; @@ -682,6 +683,8 @@ public:    bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } +  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } +    bool hasUnpackedD16VMem() const {      return HasUnpackedD16VMem;    } @@ -819,6 +822,10 @@ public:      return HasShaderCyclesRegister;    } +  bool hasShaderCyclesHiLoRegisters() const { +    return HasShaderCyclesHiLoRegisters; +  } +    bool hasVOP3Literal() const {      return HasVOP3Literal;    } @@ -1096,7 +1103,7 @@ public:    bool hasDstSelForwardingHazard() const { return GFX940Insts; }    // Cannot use op_sel with v_dot instructions. -  bool hasDOTOpSelHazard() const { return GFX940Insts; } +  bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; }    // Does not have HW interlocs for VALU writing and then reading SGPRs.    bool hasVDecCoExecHazard() const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td deleted file mode 100644 index 4956a1586774..000000000000 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td +++ /dev/null @@ -1,116 +0,0 @@ -//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// LDSDIR encoding -//===----------------------------------------------------------------------===// - -class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { -  // encoding fields -  bits<2> attrchan; -  bits<6> attr; -  bits<4> waitvdst; -  bits<8> vdst; - -  // encoding -  let Inst{31-24} = 0xce; // encoding -  let Inst{23-22} = 0x0; // reserved -  let Inst{21-20} = op; -  let Inst{19-16} = waitvdst; -  let Inst{15-10} = !if(is_direct, ?, attr); -  let Inst{9-8} = !if(is_direct, ?, attrchan); -  let Inst{7-0} = vdst; -} - -//===----------------------------------------------------------------------===// -// LDSDIR Classes -//===----------------------------------------------------------------------===// - -class LDSDIR_getIns<bit direct> { -  dag ret = !if(direct, -    (ins wait_vdst:$waitvdst), -    (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) -  ); -} - -class LDSDIR_Common<string opName, string asm = "", bit direct> : InstSI< -    (outs VGPR_32:$vdst), -    LDSDIR_getIns<direct>.ret, -    asm> { -  let LDSDIR = 1; -  let EXP_CNT = 1; - -  let hasSideEffects = 0; -  let mayLoad = 1; -  let mayStore = 0; - -  string Mnemonic = opName; -  let UseNamedOperandTable = 1; - -  let Uses = [M0, EXEC]; -  let DisableWQM = 0; -  let SchedRW = [WriteLDS]; - -  bit is_direct; -  let is_direct = direct; -} - -class LDSDIR_Pseudo<string opName, bit direct> : -  LDSDIR_Common<opName, "", direct>, -  SIMCInstr<opName, SIEncodingFamily.NONE> { -  let isPseudo = 1; -  let isCodeGenOnly = 1; -} - -class LDSDIR_getAsm<bit direct> { -  string ret = !if(direct, -    " $vdst$waitvdst", -    " $vdst, $attr$attrchan$waitvdst" -  ); -} - -class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> : -  LDSDIR_Common<lds.Mnemonic, -                lds.Mnemonic # LDSDIR_getAsm<lds.is_direct>.ret, -                lds.is_direct>, -  SIMCInstr <lds.Mnemonic, subtarget>, -  LDSDIRe<op, lds.is_direct> { -  let isPseudo = 0; -  let isCodeGenOnly = 0; -} - -//===----------------------------------------------------------------------===// -// LDS Direct Instructions -//===----------------------------------------------------------------------===// - -def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; -def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; - -def : GCNPat < -  (f32 (int_amdgcn_lds_direct_load M0)), -  (LDS_DIRECT_LOAD 0) ->; - -def : GCNPat < -  (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), -  (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) ->; - -//===----------------------------------------------------------------------===// -// GFX11+ -//===----------------------------------------------------------------------===// - -multiclass LDSDIR_Real_gfx11<bits<2> op, LDSDIR_Pseudo lds = !cast<LDSDIR_Pseudo>(NAME)> { -  def _gfx11 : LDSDIR_Real<op, lds, SIEncodingFamily.GFX11> { -    let AssemblerPredicate = isGFX11Plus; -    let DecoderNamespace = "GFX11"; -  } -} - -defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>; -defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index edc244db613d..6c7977e22599 100644 --- 
a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,    }  } -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, -                                         const MCSubtargetInfo &STI, -                                         raw_ostream &O) { -  int16_t SImm = static_cast<int16_t>(Imm); -  if (isInlinableIntLiteral(SImm)) { -    O << SImm; -    return; -  } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. +static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, +                                  raw_ostream &O) {    if (Imm == 0x3C00) -    O<< "1.0"; +    O << "1.0";    else if (Imm == 0xBC00) -    O<< "-1.0"; +    O << "-1.0";    else if (Imm == 0x3800) -    O<< "0.5"; +    O << "0.5";    else if (Imm == 0xB800) -    O<< "-0.5"; +    O << "-0.5";    else if (Imm == 0x4000) -    O<< "2.0"; +    O << "2.0";    else if (Imm == 0xC000) -    O<< "-2.0"; +    O << "-2.0";    else if (Imm == 0x4400) -    O<< "4.0"; +    O << "4.0";    else if (Imm == 0xC400) -    O<< "-4.0"; -  else if (Imm == 0x3118 && -           STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { +    O << "-4.0"; +  else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))      O << "0.15915494"; -  } else { -    uint64_t Imm16 = static_cast<uint16_t>(Imm); -    O << formatHex(Imm16); -  } -} +  else +    return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, -                                           const MCSubtargetInfo &STI, -                                           raw_ostream &O) { -  uint16_t Lo16 = static_cast<uint16_t>(Imm); -  printImmediate16(Lo16, STI, O); +  return true;  } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,                                           const MCSubtargetInfo &STI,                                           raw_ostream &O) { +  int16_t SImm = static_cast<int16_t>(Imm); +  if (isInlinableIntLiteral(SImm)) { +    O << SImm; +    return; +  } + +  uint16_t HImm = static_cast<uint16_t>(Imm); +  if (printImmediateFloat16(HImm, STI, O)) +    return; + +  uint64_t Imm16 = static_cast<uint16_t>(Imm); +  O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, +                                           const MCSubtargetInfo &STI, +                                           raw_ostream &O) {    int32_t SImm = static_cast<int32_t>(Imm); -  if (SImm >= -16 && SImm <= 64) { +  if (isInlinableIntLiteral(SImm)) {      O << SImm;      return;    } +  switch (OpType) { +  case AMDGPU::OPERAND_REG_IMM_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: +    if (printImmediateFloat32(Imm, STI, O)) +      return; +    break; +  case AMDGPU::OPERAND_REG_IMM_V2FP16: +  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: +  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: +    if (isUInt<16>(Imm) && +        printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O)) +      return; +    break; +  default: +    llvm_unreachable("bad operand type"); +  } + +  O << formatHex(static_cast<uint64_t>(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, +                                              const MCSubtargetInfo &STI, +                                      
        raw_ostream &O) {    if (Imm == llvm::bit_cast<uint32_t>(0.0f))      O << "0.0";    else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,             STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))      O << "0.15915494";    else -    O << formatHex(static_cast<uint64_t>(Imm)); +    return false; + +  return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +                                         const MCSubtargetInfo &STI, +                                         raw_ostream &O) { +  int32_t SImm = static_cast<int32_t>(Imm); +  if (isInlinableIntLiteral(SImm)) { +    O << SImm; +    return; +  } + +  if (printImmediateFloat32(Imm, STI, O)) +    return; + +  O << formatHex(static_cast<uint64_t>(Imm));  }  void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -639,6 +684,20 @@ void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,    printU4ImmDecOperand(MI, OpNo, O);  } +void AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo, +                                        const MCSubtargetInfo &STI, +                                        raw_ostream &O) { +  O << " wait_va_vdst:"; +  printU4ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo, +                                        const MCSubtargetInfo &STI, +                                        raw_ostream &O) { +  O << " wait_vm_vsrc:"; +  printU4ImmDecOperand(MI, OpNo, O); +} +  void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,                                      const MCSubtargetInfo &STI,                                      raw_ostream &O) { @@ -741,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,        break;      case AMDGPU::OPERAND_REG_IMM_V2INT16:      case AMDGPU::OPERAND_REG_IMM_V2FP16: -      if (!isUInt<16>(Op.getImm()) && -          STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { -        printImmediate32(Op.getImm(), STI, O); -        break; -      } - -      //  Deal with 16-bit FP inline immediates not working. 
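For illustration only (not part of the diff): printImmediate16 and the new printImmediateFloat16 helper above share a small table of half-precision inline constants. A standalone sketch of that lookup, using only the values visible in the hunk (in the real printer the 0x3118 entry is gated on FeatureInv2PiInlineImm):

    #include <cstdint>

    // Returns the canonical spelling of an f16 inline constant, or nullptr if
    // the value has to be printed as a hex literal instead.
    static const char *inlineFP16Name(uint16_t Imm, bool HasInv2Pi) {
      switch (Imm) {
      case 0x3C00: return "1.0";
      case 0xBC00: return "-1.0";
      case 0x3800: return "0.5";
      case 0xB800: return "-0.5";
      case 0x4000: return "2.0";
      case 0xC000: return "-2.0";
      case 0x4400: return "4.0";
      case 0xC400: return "-4.0";
      case 0x3118: return HasInv2Pi ? "0.15915494" : nullptr; // 1/(2*pi)
      default:     return nullptr;
      }
    }

The printImmediateFloat32 helper a few lines below follows the same pattern with the f32 bit patterns.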
-      if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { -        printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); -        break; -      } -      [[fallthrough]];      case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:      case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: -      printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); -      break;      case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:      case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: -      printImmediateV216(Op.getImm(), STI, O); +      printImmediateV216(Op.getImm(), OpTy, STI, O);        break;      case MCOI::OPERAND_UNKNOWN:      case MCOI::OPERAND_PCREL: diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 95c26de6299e..e3958f88277d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ private:                             raw_ostream &O);    void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,                          raw_ostream &O); -  void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, -                          raw_ostream &O); +  void printImmediateV216(uint32_t Imm, uint8_t OpType, +                          const MCSubtargetInfo &STI, raw_ostream &O); +  bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, +                             raw_ostream &O);    void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,                          raw_ostream &O);    void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, @@ -161,6 +163,10 @@ private:                      raw_ostream &O);    void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,                      raw_ostream &O); +  void printWaitVAVDst(const MCInst *MI, unsigned OpNo, +                       const MCSubtargetInfo &STI, raw_ostream &O); +  void printWaitVMVSrc(const MCInst *MI, unsigned OpNo, +                       const MCSubtargetInfo &STI, raw_ostream &O);    void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,                      raw_ostream &O, unsigned N); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69d9ff1..de1abaf29c56 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,      // which does not have f16 support?      
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);    case AMDGPU::OPERAND_REG_IMM_V2INT16: -  case AMDGPU::OPERAND_REG_IMM_V2FP16: { -    if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) -      return getLit32Encoding(static_cast<uint32_t>(Imm), STI); -    if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) -      return getLit16Encoding(static_cast<uint16_t>(Imm), STI); -    [[fallthrough]]; -  }    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:    case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: -    return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); +    return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm)) +        .value_or(255); +  case AMDGPU::OPERAND_REG_IMM_V2FP16:    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: -  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { -    uint16_t Lo16 = static_cast<uint16_t>(Imm); -    uint32_t Encoding = getLit16Encoding(Lo16, STI); -    return Encoding; -  } +  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: +    return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) +        .value_or(255);    case AMDGPU::OPERAND_KIMM32:    case AMDGPU::OPERAND_KIMM16:      return MO.getImm(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 9a2fb0bc37b2..674fd04f2fc1 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1651,7 +1651,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],    BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);    for (unsigned i = 0; i < 4; i++) { -    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); +    unsigned Idx = Swz[i]->getAsZExtVal();      if (SwizzleRemap.contains(Idx))        Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);    } @@ -1659,7 +1659,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],    SwizzleRemap.clear();    BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);    for (unsigned i = 0; i < 4; i++) { -    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); +    unsigned Idx = Swz[i]->getAsZExtVal();      if (SwizzleRemap.contains(Idx))        Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);    } @@ -1780,7 +1780,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,      // Check that we know which element is being inserted      if (!isa<ConstantSDNode>(EltNo))        return SDValue(); -    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); +    unsigned Elt = EltNo->getAsZExtVal();      // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially      // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the @@ -2021,7 +2021,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,    }    case R600::MOV_IMM_GLOBAL_ADDR:      // Check if the Imm slot is used. Taken from below. 
-    if (cast<ConstantSDNode>(Imm)->getZExtValue()) +    if (Imm->getAsZExtVal())        return false;      Imm = Src.getOperand(0);      Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de612d81d..aa7639a0f186 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {    assert(Old.isReg() && Fold.isImm());    if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || -      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || -      isUInt<16>(Fold.ImmToFold) || -      !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) +      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))      return false;    unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {    MachineOperand &Old = MI->getOperand(Fold.UseOpNo);    unsigned Opcode = MI->getOpcode();    int OpNo = MI->getOperandNo(&Old); +  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + +  // If the literal can be inlined as-is, apply it and short-circuit the +  // tests below. The main motivation for this is to avoid unintuitive +  // uses of opsel. +  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { +    Old.ChangeToImmediate(Fold.ImmToFold); +    return true; +  } -  // Set op_sel/op_sel_hi on this operand or bail out if op_sel is -  // already set. +  // Refer to op_sel/op_sel_hi and check if we can change the immediate and +  // op_sel in a way that allows an inline constant.    int ModIdx = -1; -  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) +  unsigned SrcIdx = ~0; +  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {      ModIdx = AMDGPU::OpName::src0_modifiers; -  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) +    SrcIdx = 0; +  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {      ModIdx = AMDGPU::OpName::src1_modifiers; -  else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) +    SrcIdx = 1; +  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {      ModIdx = AMDGPU::OpName::src2_modifiers; +    SrcIdx = 2; +  }    assert(ModIdx != -1);    ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);    MachineOperand &Mod = MI->getOperand(ModIdx); -  unsigned Val = Mod.getImm(); -  if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) +  unsigned ModVal = Mod.getImm(); + +  uint16_t ImmLo = static_cast<uint16_t>( +      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); +  uint16_t ImmHi = static_cast<uint16_t>( +      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); +  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; +  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + +  // Helper function that attempts to inline the given value with a newly +  // chosen opsel pattern. 
+  auto tryFoldToInline = [&](uint32_t Imm) -> bool { +    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { +      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); +      Old.ChangeToImmediate(Imm); +      return true; +    } + +    // Try to shuffle the halves around and leverage opsel to get an inline +    // constant. +    uint16_t Lo = static_cast<uint16_t>(Imm); +    uint16_t Hi = static_cast<uint16_t>(Imm >> 16); +    if (Lo == Hi) { +      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { +        Mod.setImm(NewModVal); +        Old.ChangeToImmediate(Lo); +        return true; +      } + +      if (static_cast<int16_t>(Lo) < 0) { +        int32_t SExt = static_cast<int16_t>(Lo); +        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { +          Mod.setImm(NewModVal); +          Old.ChangeToImmediate(SExt); +          return true; +        } +      } + +      // This check is only useful for integer instructions +      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || +          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { +        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { +          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); +          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16); +          return true; +        } +      } +    } else { +      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; +      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { +        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); +        Old.ChangeToImmediate(Swapped); +        return true; +      } +    } +      return false; +  }; -  // Only apply the following transformation if that operand requires -  // a packed immediate. -  // If upper part is all zero we do not need op_sel_hi. -  if (!(Fold.ImmToFold & 0xffff)) { -    MachineOperand New = -        MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); -    if (!TII->isOperandLegal(*MI, OpNo, &New)) -      return false; -    Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); -    Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); -    Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); +  if (tryFoldToInline(Imm))      return true; + +  // Replace integer addition by subtraction and vice versa if it allows +  // folding the immediate to an inline constant. +  // +  // We should only ever get here for SrcIdx == 1 due to canonicalization +  // earlier in the pipeline, but we double-check here to be safe / fully +  // general. +  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; +  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; +  if (SrcIdx == 1 && (IsUAdd || IsUSub)) { +    unsigned ClampIdx = +        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); +    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + +    if (!Clamp) { +      uint16_t NegLo = -static_cast<uint16_t>(Imm); +      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); +      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; + +      if (tryFoldToInline(NegImm)) { +        unsigned NegOpcode = +            IsUAdd ? 
AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; +        MI->setDesc(TII->get(NegOpcode)); +        return true; +      } +    }    } -  MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); -  if (!TII->isOperandLegal(*MI, OpNo, &New)) -    return false; -  Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); -  Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); -  return true; + +  return false;  }  bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {    MachineOperand &Old = MI->getOperand(Fold.UseOpNo);    assert(Old.isReg()); -  if (Fold.isImm() && canUseImmWithOpSel(Fold)) -    return tryFoldImmWithOpSel(Fold); +  if (Fold.isImm() && canUseImmWithOpSel(Fold)) { +    if (tryFoldImmWithOpSel(Fold)) +      return true; + +    // We can't represent the candidate as an inline constant. Try as a literal +    // with the original opsel, checking constant bus limitations. +    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); +    int OpNo = MI->getOperandNo(&Old); +    if (!TII->isOperandLegal(*MI, OpNo, &New)) +      return false; +    Old.ChangeToImmediate(Fold.ImmToFold); +    return true; +  }    if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {      MachineBasicBlock *MBB = MI->getParent(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e857e6ac71b..6ddc7e864fb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -151,22 +151,29 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      if (Subtarget->useRealTrue16Insts()) {        addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);        addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); +      addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);      } else {        addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);        addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); +      addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);      }      // Unless there are also VOP3P operations, not operations are really legal.      
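For illustration only, referring back to the SIFoldOperands change a few hunks above (tryFoldToInline and the V_PK_ADD_U16/V_PK_SUB_U16 swap): that rewrite relies on the per-lane identity a + k == a - (-k) mod 2^16, which only holds when clamping is disabled. A throwaway C++ check of that identity, with made-up helper names:

    #include <cassert>
    #include <cstdint>

    // Negate each 16-bit half of a packed v2i16 immediate independently.
    static uint32_t negPerLane(uint32_t Imm) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      return (static_cast<uint32_t>(NegHi) << 16) | NegLo;
    }

    // Per-lane wrapping add/sub, modelling v_pk_add_u16 / v_pk_sub_u16 with
    // clamp disabled.
    static uint32_t pkAddU16(uint32_t A, uint32_t B) {
      uint16_t Lo = static_cast<uint16_t>(A) + static_cast<uint16_t>(B);
      uint16_t Hi = static_cast<uint16_t>(A >> 16) + static_cast<uint16_t>(B >> 16);
      return (static_cast<uint32_t>(Hi) << 16) | Lo;
    }

    static uint32_t pkSubU16(uint32_t A, uint32_t B) {
      uint16_t Lo = static_cast<uint16_t>(A) - static_cast<uint16_t>(B);
      uint16_t Hi = static_cast<uint16_t>(A >> 16) - static_cast<uint16_t>(B >> 16);
      return (static_cast<uint32_t>(Hi) << 16) | Lo;
    }

    int main() {
      uint32_t X = 0x1234abcd, K = 0xffff0002; // arbitrary packed test values
      assert(pkAddU16(X, K) == pkSubU16(X, negPerLane(K)));
      return 0;
    }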
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);      addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); +    addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);      addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);      addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); +    addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);      addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);      addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); +    addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);      addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);      addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); +    addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);      addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);      addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); +    addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);    }    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -196,6 +203,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                        MVT::i1,     MVT::v32i32},                       Custom); +  if (isTypeLegal(MVT::bf16)) { +    for (unsigned Opc : +         {ISD::FADD,     ISD::FSUB,       ISD::FMUL,    ISD::FDIV, +          ISD::FREM,     ISD::FMA,        ISD::FMINNUM, ISD::FMAXNUM, +          ISD::FMINIMUM, ISD::FMAXIMUM,   ISD::FSQRT,   ISD::FCBRT, +          ISD::FSIN,     ISD::FCOS,       ISD::FPOW,    ISD::FPOWI, +          ISD::FLDEXP,   ISD::FFREXP,     ISD::FLOG,    ISD::FLOG2, +          ISD::FLOG10,   ISD::FEXP,       ISD::FEXP2,   ISD::FEXP10, +          ISD::FCEIL,    ISD::FTRUNC,     ISD::FRINT,   ISD::FNEARBYINT, +          ISD::FROUND,   ISD::FROUNDEVEN, ISD::FFLOOR,  ISD::FCANONICALIZE, +          ISD::SETCC}) { +      // FIXME: The promoted to type shouldn't need to be explicit +      setOperationAction(Opc, MVT::bf16, Promote); +      AddPromotedToType(Opc, MVT::bf16, MVT::f32); +    } + +    setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); + +    setOperationAction(ISD::SELECT, MVT::bf16, Promote); +    AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16); + +    // TODO: Could make these legal +    setOperationAction(ISD::FABS, MVT::bf16, Expand); +    setOperationAction(ISD::FNEG, MVT::bf16, Expand); +    setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); + +    // We only need to custom lower because we can't specify an action for bf16 +    // sources. +    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); +    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + +    setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote); +    AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16); +  } +    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);    setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);    setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); @@ -271,13 +313,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    // We only support LOAD/STORE and vector manipulation ops for vectors    // with > 4 elements.    
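For illustration only (not part of the diff): most of the new bf16 handling above promotes scalar bf16 arithmetic to f32 and legalizes bf16 loads and stores as i16, which works because a bf16 value is exactly the high 16 bits of an IEEE f32. A simplified sketch of that promotion (truncating instead of rounding on the way back, so it is not bit-exact with the real lowering):

    #include <cstdint>
    #include <cstring>

    // Widen a bf16 bit pattern to f32: bf16 occupies the top half of an f32.
    static float bf16ToF32(uint16_t B) {
      uint32_t Bits = static_cast<uint32_t>(B) << 16;
      float F;
      std::memcpy(&F, &Bits, sizeof(F));
      return F;
    }

    // Narrow f32 back to bf16 by truncation (a real implementation rounds).
    static uint16_t f32ToBF16Trunc(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof(Bits));
      return static_cast<uint16_t>(Bits >> 16);
    }

    // "Promoted" bf16 add: do the arithmetic in f32, then narrow the result.
    static uint16_t bf16Add(uint16_t A, uint16_t B) {
      return f32ToBF16Trunc(bf16ToF32(A) + bf16ToF32(B));
    }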
for (MVT VT : -       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32, -        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, -        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16, -        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,  MVT::v6i32,  MVT::v6f32, -        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,  MVT::v8i16, -        MVT::v8f16,  MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, -        MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { +       {MVT::v8i32,   MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32, +        MVT::v10f32,  MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, +        MVT::v16i32,  MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16, +        MVT::v4f16,   MVT::v4bf16, MVT::v3i64,  MVT::v3f64,  MVT::v6i32, +        MVT::v6f32,   MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64, +        MVT::v8i16,   MVT::v8f16,  MVT::v8bf16, MVT::v16i16, MVT::v16f16, +        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, +        MVT::v32i16,  MVT::v32f16, MVT::v32bf16}) {      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {        switch (Op) {        case ISD::LOAD: @@ -383,13 +426,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                       {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},                       Expand); -  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom); +  setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, +                     Custom);    // Avoid stack access for these.    // TODO: Generalize to more vector types.    setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, -                     {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, -                      MVT::v4i16, MVT::v4f16}, +                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, +                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},                       Custom);    // Deal with vec3 vector operations when widened to vec4. @@ -498,6 +542,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);    setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); +  // Custom lower these because we can't specify a rule based on an illegal +  // source bf16. +  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); +  setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom); +    if (Subtarget->has16BitInsts()) {      setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,                          ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, @@ -524,9 +573,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);      setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); +    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); +    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + +    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);      // F16 - Constant Actions.      setOperationAction(ISD::ConstantFP, MVT::f16, Legal); +    setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);      // F16 - Load/Store Actions.      
setOperationAction(ISD::LOAD, MVT::f16, Promote); @@ -534,16 +588,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      setOperationAction(ISD::STORE, MVT::f16, Promote);      AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); +    // BF16 - Load/Store Actions. +    setOperationAction(ISD::LOAD, MVT::bf16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16); +    setOperationAction(ISD::STORE, MVT::bf16, Promote); +    AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16); +      // F16 - VOP1 Actions.      setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,                          ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},                         MVT::f16, Custom); -    setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);      setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); +    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);      // F16 - VOP2 Actions. -    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); +    setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, +                       Expand);      setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);      setOperationAction(ISD::FFREXP, MVT::f16, Custom);      setOperationAction(ISD::FDIV, MVT::f16, Custom); @@ -554,8 +615,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,        setOperationAction(ISD::FMAD, MVT::f16, Legal);      for (MVT VT : -         {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, -          MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { +         {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, +          MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, +          MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {        for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {          switch (Op) {          case ISD::LOAD: @@ -587,7 +649,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      // XXX - Do these do anything? Vector constants turn into build_vector.      
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); -    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal); +    setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, +                       Legal);      setOperationAction(ISD::STORE, MVT::v2i16, Promote);      AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); @@ -610,16 +673,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);      setOperationAction(ISD::LOAD, MVT::v4f16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); +    setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);      setOperationAction(ISD::STORE, MVT::v4i16, Promote);      AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);      setOperationAction(ISD::STORE, MVT::v4f16, Promote);      AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); +    setOperationAction(ISD::STORE, MVT::v4bf16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);      setOperationAction(ISD::LOAD, MVT::v8i16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);      setOperationAction(ISD::LOAD, MVT::v8f16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); +    setOperationAction(ISD::LOAD, MVT::v8bf16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);      setOperationAction(ISD::STORE, MVT::v4i16, Promote);      AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); @@ -630,26 +699,36 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,      AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);      setOperationAction(ISD::STORE, MVT::v8f16, Promote);      AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); +    setOperationAction(ISD::STORE, MVT::v8bf16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);      setOperationAction(ISD::LOAD, MVT::v16i16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);      setOperationAction(ISD::LOAD, MVT::v16f16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); +    setOperationAction(ISD::LOAD, MVT::v16bf16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);      setOperationAction(ISD::STORE, MVT::v16i16, Promote);      AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);      setOperationAction(ISD::STORE, MVT::v16f16, Promote);      AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); +    setOperationAction(ISD::STORE, MVT::v16bf16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);      setOperationAction(ISD::LOAD, MVT::v32i16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);      setOperationAction(ISD::LOAD, MVT::v32f16, Promote);      AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); +    setOperationAction(ISD::LOAD, MVT::v32bf16, Promote); +    AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);      setOperationAction(ISD::STORE, MVT::v32i16, Promote);      AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);      setOperationAction(ISD::STORE, MVT::v32f16, Promote);      AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); +    setOperationAction(ISD::STORE, MVT::v32bf16, Promote); +    AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);      setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},                         MVT::v2i32, Expand); @@ -662,7 +741,8 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM,                         MVT::v8i32, Expand);      if (!Subtarget->hasVOP3PInsts()) -      setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom); +      setOperationAction(ISD::BUILD_VECTOR, +                         {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);      setOperationAction(ISD::FNEG, MVT::v2f16, Legal);      // This isn't really legal, but this avoids the legalizer unrolling it (and @@ -680,8 +760,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                         {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},                         Expand); -    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, -                      MVT::v32i16, MVT::v32f16}) { +    for (MVT Vec16 : +         {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, +          MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {        setOperationAction(            {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},            Vec16, Custom); @@ -699,7 +780,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                          ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},                         MVT::v2f16, Legal); -    setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16}, +    setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},                         Custom);      setOperationAction(ISD::VECTOR_SHUFFLE, @@ -724,7 +805,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,                         Custom);      setOperationAction(ISD::FEXP, MVT::v2f16, Custom); -    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom); +    setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, +                       Custom);      if (Subtarget->hasPackedFP32Ops()) {        setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, @@ -750,13 +832,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,    }    setOperationAction(ISD::SELECT, -                     {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, -                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, -                      MVT::v32i16, MVT::v32f16}, +                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8, +                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16, +                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16, +                      MVT::v32f16, MVT::v32bf16},                       Custom);    setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); +  if (Subtarget->hasScalarSMulU64()) +    setOperationAction(ISD::MUL, MVT::i64, Custom); +    if (Subtarget->hasMad64_32())      setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); @@ -3902,6 +3988,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {    return Op;  } +// Work around DAG legality rules only based on the result type. +SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { +  bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; +  SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); +  EVT SrcVT = Src.getValueType(); + +  if (SrcVT.getScalarType() != MVT::bf16) +    return Op; + +  SDLoc SL(Op); +  SDValue BitCast = +      DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); + +  EVT DstVT = Op.getValueType(); +  if (IsStrict) +    llvm_unreachable("Need STRICT_BF16_TO_FP"); + +  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); +} +  Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,                                               const MachineFunction &MF) const {    Register Reg = StringSwitch<Register>(RegName) @@ -4825,6 +4931,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(      MI.eraseFromParent();      return BB;    } +  case AMDGPU::GET_SHADERCYCLESHILO: { +    assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); +    MachineRegisterInfo &MRI = MF->getRegInfo(); +    const DebugLoc &DL = MI.getDebugLoc(); +    // The algorithm is: +    // +    // hi1 = getreg(SHADER_CYCLES_HI) +    // lo1 = getreg(SHADER_CYCLES_LO) +    // hi2 = getreg(SHADER_CYCLES_HI) +    // +    // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. +    // Otherwise there was overflow and the result is hi2:0. In both cases the +    // result should represent the actual time at some point during the sequence +    // of three getregs. +    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) +        .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, +                                           0, 32)); +    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) +        .addImm( +            AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32)); +    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) +        .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, +                                           0, 32)); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) +        .addReg(RegHi1) +        .addReg(RegHi2); +    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) +        .addReg(RegLo1) +        .addImm(0); +    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) +        .add(MI.getOperand(0)) +        .addReg(RegLo) +        .addImm(AMDGPU::sub0) +        .addReg(RegHi2) +        .addImm(AMDGPU::sub1); +    MI.eraseFromParent(); +    return BB; +  }    case AMDGPU::SI_INDIRECT_SRC_V1:    case AMDGPU::SI_INDIRECT_SRC_V2:    case AMDGPU::SI_INDIRECT_SRC_V4: @@ -5305,7 +5453,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,    assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||           VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||           VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || -         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16); +         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || +         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || +         VT == MVT::v32bf16);    SDValue Lo0, Hi0;    SDValue Op0 = Op.getOperand(0); @@ -5424,7 +5574,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {    case ISD::SRL:    case ISD::ADD:    case 
ISD::SUB: -  case ISD::MUL:    case ISD::SMIN:    case ISD::SMAX:    case ISD::UMIN: @@ -5438,6 +5587,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {    case ISD::SADDSAT:    case ISD::SSUBSAT:      return splitBinaryVectorOp(Op, DAG); +  case ISD::MUL: +    return lowerMUL(Op, DAG);    case ISD::SMULO:    case ISD::UMULO:      return lowerXMULO(Op, DAG); @@ -5452,6 +5603,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {      return lowerGET_ROUNDING(Op, DAG);    case ISD::PREFETCH:      return lowerPREFETCH(Op, DAG); +  case ISD::FP_EXTEND: +  case ISD::STRICT_FP_EXTEND: +    return lowerFP_EXTEND(Op, DAG);    }    return SDValue();  } @@ -6090,6 +6244,66 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {    return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);  } +// Custom lowering for vector multiplications and s_mul_u64. +SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { +  EVT VT = Op.getValueType(); + +  // Split vector operands. +  if (VT.isVector()) +    return splitBinaryVectorOp(Op, DAG); + +  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64"); + +  // There are four ways to lower s_mul_u64: +  // +  // 1. If all the operands are uniform, then we lower it as it is. +  // +  // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit +  //    multiplications because there is not a vector equivalent of s_mul_u64. +  // +  // 3. If the cost model decides that it is more efficient to use vector +  //    registers, then we have to split s_mul_u64 in 32-bit multiplications. +  //    This happens in splitScalarSMULU64() in SIInstrInfo.cpp . +  // +  // 4. If the cost model decides to use vector registers and both of the +  //    operands are zero-extended/sign-extended from 32-bits, then we split the +  //    s_mul_u64 in two 32-bit multiplications. The problem is that it is not +  //    possible to check if the operands are zero-extended or sign-extended in +  //    SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with +  //    s_mul_u64_u32_pseudo if both operands are zero-extended and we replace +  //    s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended. +  //    If the cost model decides that we have to use vector registers, then +  //    splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/ +  //    s_mul_i64_i32_pseudo in two vector multiplications. If the cost model +  //    decides that we should use scalar registers, then s_mul_u64_u32_pseudo/ +  //    s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in +  //    SIInstrInfo.cpp . + +  if (Op->isDivergent()) +    return SDValue(); + +  SDValue Op0 = Op.getOperand(0); +  SDValue Op1 = Op.getOperand(1); +  // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 +  // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to +  // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. 
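For illustration only (not part of the diff): "split s_mul_u64 in 32-bit multiplications" in the comment above is the usual schoolbook decomposition of a 64-bit product; the cross terms are exactly what the zero-extended and sign-extended special cases avoid. A minimal sketch with made-up names:

    #include <cstdint>

    // Low 64 bits of a 64x64 multiply built from 32-bit pieces:
    //   lo32(result) = lo32(lo32(a) * lo32(b))
    //   hi32(result) = hi32(lo32(a) * lo32(b)) + lo32(a)*hi32(b) + hi32(a)*lo32(b)
    static uint64_t mul64Via32(uint64_t A, uint64_t B) {
      uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
      uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);

      uint64_t LoLo = static_cast<uint64_t>(ALo) * BLo;  // needs the full 64 bits
      uint32_t Hi = static_cast<uint32_t>(LoLo >> 32) +
                    ALo * BHi +                          // cross terms wrap mod 2^32
                    AHi * BLo;
      return (static_cast<uint64_t>(Hi) << 32) | static_cast<uint32_t>(LoLo);
    }

When both operands are known to be zero-extended from 32 bits, the cross terms vanish and a single 32x32->64 multiply remains, which is the case S_MUL_U64_U32_PSEUDO captures.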
+  KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); +  unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); +  KnownBits Op1KnownBits = DAG.computeKnownBits(Op1); +  unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); +  SDLoc SL(Op); +  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) +    return SDValue( +        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0); +  unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0); +  unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1); +  if (Op0SignBits >= 33 && Op1SignBits >= 33) +    return SDValue( +        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0); +  // If all the operands are uniform, then we lower s_mul_u64 as it is. +  return Op; +} +  SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {    EVT VT = Op.getValueType();    SDLoc SL(Op); @@ -6424,7 +6638,7 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,    EVT InsVT = Ins.getValueType();    EVT EltVT = VecVT.getVectorElementType();    unsigned InsNumElts = InsVT.getVectorNumElements(); -  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); +  unsigned IdxVal = Idx->getAsZExtVal();    SDLoc SL(Op);    if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { @@ -6639,7 +6853,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,    SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);    SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); -  if (ResultVT == MVT::f16) { +  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {      SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);      return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);    } @@ -6725,8 +6939,8 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,    SDLoc SL(Op);    EVT VT = Op.getValueType(); -  if (VT == MVT::v4i16 || VT == MVT::v4f16 || -      VT == MVT::v8i16 || VT == MVT::v8f16) { +  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || +      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {      EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),                                    VT.getVectorNumElements() / 2);      MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); @@ -6749,7 +6963,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,      return DAG.getNode(ISD::BITCAST, SL, VT, Blend);    } -  if (VT == MVT::v16i16 || VT == MVT::v16f16) { +  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {      EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),                                       VT.getVectorNumElements() / 4);      MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); @@ -6770,7 +6984,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,      return DAG.getNode(ISD::BITCAST, SL, VT, Blend);    } -  if (VT == MVT::v32i16 || VT == MVT::v32f16) { +  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {      EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),                                       VT.getVectorNumElements() / 8);      MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); @@ -6791,7 +7005,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,      return DAG.getNode(ISD::BITCAST, SL, VT, Blend);    } -  assert(VT == MVT::v2f16 || VT == MVT::v2i16); +  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);    assert(!Subtarget->hasVOP3PInsts() && "this should 
be legal");    SDValue Lo = Op.getOperand(0); @@ -6890,6 +7104,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,          // Adjust alignment for that dynamic shared memory array.          Function &F = DAG.getMachineFunction().getFunction();          MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); +        MFI->setUsesDynamicLDS(true);          return SDValue(              DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);        } @@ -7453,7 +7668,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,      Ops.push_back(IsA16 ? True : False);    if (!Subtarget->hasGFX90AInsts()) {      Ops.push_back(TFE); //tfe -  } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { +  } else if (TFE->getAsZExtVal()) {      report_fatal_error("TFE is not supported on this GPU");    }    if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) @@ -7590,7 +7805,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,    setBufferOffsets(Offset, DAG, &Ops[3],                     NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); -  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); +  uint64_t InstOffset = Ops[5]->getAsZExtVal();    for (unsigned i = 0; i < NumLoads; ++i) {      Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);      Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, @@ -14052,11 +14267,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,      EVT VT = N->getValueType(0);      // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) -    if (VT == MVT::v2i16 || VT == MVT::v2f16) { +    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2f16) {        SDLoc SL(N);        SDValue Src = N->getOperand(0);        EVT EltVT = Src.getValueType(); -      if (EltVT == MVT::f16) +      if (EltVT != MVT::i16)          Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);        SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5bc091d6e84d..92b38ebade62 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -146,6 +146,7 @@ private:    SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; +  SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; @@ -417,6 +418,7 @@ public:    SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; +  SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;    Register getRegisterByName(const char* RegName, LLT VT,                               const MachineFunction &MF) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 55ddb540c51e..1cb1d32707f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(    });  } +static bool isCacheInvOrWBInst(MachineInstr &Inst) { +  
auto Opc = Inst.getOpcode(); +  return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || +         Opc == AMDGPU::GLOBAL_WBINV; +} +  void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,                                                 WaitcntBrackets *ScoreBrackets) {    // Now look at the instruction opcode. If it is a memory access @@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,        ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);      }    } else if (TII->isFLAT(Inst)) { +    // TODO: Track this properly. +    if (isCacheInvOrWBInst(Inst)) +      return; +      assert(Inst.mayLoadOrStore());      int FlatASCount = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 585a3eb78618..1b66d163714f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -91,7 +91,7 @@ class InstSI <dag outs, dag ins, string asm = "",    field bit VOP3_OPSEL = 0;    // Is it possible for this instruction to be atomic? -  field bit maybeAtomic = 0; +  field bit maybeAtomic = 1;    // This bit indicates that this is a VI instruction which is renamed    // in GFX9. Required for correct mapping from pseudo to MC. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c7ec18..fee900b3efb2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -338,8 +338,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,      if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))        return false; -    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); -    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); +    Offset0 = Off0->getAsZExtVal(); +    Offset1 = Off1->getAsZExtVal();      return true;    } @@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {      MI.eraseFromParent();      break;    } + +  case AMDGPU::S_MUL_U64_U32_PSEUDO: +  case AMDGPU::S_MUL_I64_I32_PSEUDO: +    MI.setDesc(get(AMDGPU::S_MUL_U64)); +    break;    }    return true;  } @@ -4153,15 +4158,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,    case AMDGPU::OPERAND_REG_IMM_V2INT16:    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:    case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: -    return (isInt<16>(Imm) || isUInt<16>(Imm)) && -           AMDGPU::isInlinableIntLiteral((int16_t)Imm); +    return AMDGPU::isInlinableLiteralV2I16(Imm); +  case AMDGPU::OPERAND_REG_IMM_V2FP16: +  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: +  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: +    return AMDGPU::isInlinableLiteralV2F16(Imm);    case AMDGPU::OPERAND_REG_IMM_FP16:    case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:    case AMDGPU::OPERAND_REG_INLINE_C_FP16: -  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: -  case AMDGPU::OPERAND_REG_IMM_V2FP16: -  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: -  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { +  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {      if (isInt<16>(Imm) || isUInt<16>(Imm)) {        // A few special case instructions have 16-bit operands on subtargets        // where 16-bit instructions are not legal. 
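The isInlineConstant() change above routes packed 16-bit immediates through the new V2I16/V2F16 helpers instead of the old splat check. As a rough standalone sketch of the integer half of that test (mirroring getInlineEncodingV216(), which this patch adds to AMDGPUBaseInfo.cpp further down; the function name below is illustrative), a packed-integer literal only gets an inline encoding when the whole 32-bit value is one of the sign-extended constants -16..64:

  #include <cstdint>
  #include <optional>

  // Integer inline encodings for packed (V2INT16) operands.
  static std::optional<unsigned> packedIntInlineEncodingSketch(uint32_t Literal) {
    int32_t Signed = static_cast<int32_t>(Literal);
    if (Signed >= 0 && Signed <= 64)
      return 128 + Signed;   // encodings 128..192
    if (Signed >= -16 && Signed <= -1)
      return 192 - Signed;   // encodings 193..208
    return std::nullopt;     // not inlinable; a real literal operand is needed
  }

The floating-point side of the same helper recognizes the usual +/-0.5, +/-1.0, +/-2.0, +/-4.0 and 1/(2*pi) values, matched as f16 bit patterns for V2FP16 operands and as f32 bit patterns for V2INT16 operands.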
@@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,      // Default handling      break;    } + +  case AMDGPU::S_MUL_U64: +    // Split s_mul_u64 in 32-bit vector multiplications. +    splitScalarSMulU64(Worklist, Inst, MDT); +    Inst.eraseFromParent(); +    return; + +  case AMDGPU::S_MUL_U64_U32_PSEUDO: +  case AMDGPU::S_MUL_I64_I32_PSEUDO: +    // This is a special case of s_mul_u64 where all the operands are either +    // zero extended or sign extended. +    splitScalarSMulPseudo(Worklist, Inst, MDT); +    Inst.eraseFromParent(); +    return; +    case AMDGPU::S_AND_B64:      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);      Inst.eraseFromParent(); @@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,    addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);  } +// There is not a vector equivalent of s_mul_u64. For this reason, we need to +// split the s_mul_u64 in 32-bit vector multiplications. +void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist, +                                     MachineInstr &Inst, +                                     MachineDominatorTree *MDT) const { +  MachineBasicBlock &MBB = *Inst.getParent(); +  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + +  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); +  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + +  MachineOperand &Dest = Inst.getOperand(0); +  MachineOperand &Src0 = Inst.getOperand(1); +  MachineOperand &Src1 = Inst.getOperand(2); +  const DebugLoc &DL = Inst.getDebugLoc(); +  MachineBasicBlock::iterator MII = Inst; + +  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); +  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); +  const TargetRegisterClass *Src0SubRC = +      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); +  if (RI.isSGPRClass(Src0SubRC)) +    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); +  const TargetRegisterClass *Src1SubRC = +      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); +  if (RI.isSGPRClass(Src1SubRC)) +    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); + +  // First, we extract the low 32-bit and high 32-bit values from each of the +  // operands. +  MachineOperand Op0L = +      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); +  MachineOperand Op1L = +      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); +  MachineOperand Op0H = +      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); +  MachineOperand Op1H = +      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + +  // The multiplication is done as follows: +  // +  //                            Op1H  Op1L +  //                          * Op0H  Op0L +  //                       -------------------- +  //                       Op1H*Op0L  Op1L*Op0L +  //          + Op1H*Op0H  Op1L*Op0H +  // ----------------------------------------- +  // (Op1H*Op0L + Op1L*Op0H + carry)  Op1L*Op0L +  // +  //  We drop Op1H*Op0H because the result of the multiplication is a 64-bit +  //  value and that would overflow. +  //  The low 32-bit value is Op1L*Op0L. +  //  The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L). 
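The decomposition described in the comment above can be checked with an ordinary 64-bit integer model: the Op1H*Op0H term only contributes to bits 64 and above, so dropping it does not change the truncated 64-bit result. A standalone sketch (not part of the patch; the helper name is illustrative):

  #include <cassert>
  #include <cstdint>

  // Scalar model of the split performed below: lo = mul_lo(a0, b0),
  // hi = a1*b0 + a0*b1 + mul_hi(a0, b0), all in 32-bit arithmetic.
  static uint64_t mulU64FromU32Pieces(uint64_t A, uint64_t B) {
    uint32_t A0 = static_cast<uint32_t>(A), A1 = static_cast<uint32_t>(A >> 32);
    uint32_t B0 = static_cast<uint32_t>(B), B1 = static_cast<uint32_t>(B >> 32);
    uint32_t Lo = A0 * B0;                                                   // V_MUL_LO_U32
    uint32_t Carry = static_cast<uint32_t>((static_cast<uint64_t>(A0) * B0) >> 32); // V_MUL_HI_U32
    uint32_t Hi = A1 * B0 + A0 * B1 + Carry;                                 // two V_MUL_LO + V_ADD
    return (static_cast<uint64_t>(Hi) << 32) | Lo;
  }

  int main() {
    uint64_t A = 0x123456789ABCDEF0ULL, B = 0x0FEDCBA987654321ULL;
    assert(mulU64FromU32Pieces(A, B) == A * B); // matches the full product mod 2^64
    return 0;
  }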
+ +  Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  MachineInstr *Op1L_Op0H = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg) +          .add(Op1L) +          .add(Op0H); + +  Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  MachineInstr *Op1H_Op0L = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg) +          .add(Op1H) +          .add(Op0L); + +  Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  MachineInstr *Carry = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg) +          .add(Op1L) +          .add(Op0L); + +  MachineInstr *LoHalf = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) +          .add(Op1L) +          .add(Op0L); + +  Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg) +                          .addReg(Op1L_Op0H_Reg) +                          .addReg(Op1H_Op0L_Reg); + +  MachineInstr *HiHalf = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1) +          .addReg(AddReg) +          .addReg(CarryReg); + +  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) +      .addReg(DestSub0) +      .addImm(AMDGPU::sub0) +      .addReg(DestSub1) +      .addImm(AMDGPU::sub1); + +  MRI.replaceRegWith(Dest.getReg(), FullDestReg); + +  // Try to legalize the operands in case we need to swap the order to keep it +  // valid. +  legalizeOperands(*Op1L_Op0H, MDT); +  legalizeOperands(*Op1H_Op0L, MDT); +  legalizeOperands(*Carry, MDT); +  legalizeOperands(*LoHalf, MDT); +  legalizeOperands(*Add, MDT); +  legalizeOperands(*HiHalf, MDT); + +  // Move all users of this moved value. +  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + +// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector +// multiplications. +void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist, +                                        MachineInstr &Inst, +                                        MachineDominatorTree *MDT) const { +  MachineBasicBlock &MBB = *Inst.getParent(); +  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + +  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); +  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); +  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + +  MachineOperand &Dest = Inst.getOperand(0); +  MachineOperand &Src0 = Inst.getOperand(1); +  MachineOperand &Src1 = Inst.getOperand(2); +  const DebugLoc &DL = Inst.getDebugLoc(); +  MachineBasicBlock::iterator MII = Inst; + +  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); +  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); +  const TargetRegisterClass *Src0SubRC = +      RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); +  if (RI.isSGPRClass(Src0SubRC)) +    Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); +  const TargetRegisterClass *Src1SubRC = +      RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); +  if (RI.isSGPRClass(Src1SubRC)) +    Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); + +  // First, we extract the low 32-bit and high 32-bit values from each of the +  // operands. 
+  MachineOperand Op0L = +      buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); +  MachineOperand Op1L = +      buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + +  unsigned Opc = Inst.getOpcode(); +  unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO +                        ? AMDGPU::V_MUL_HI_U32_e64 +                        : AMDGPU::V_MUL_HI_I32_e64; +  MachineInstr *HiHalf = +      BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L); + +  MachineInstr *LoHalf = +      BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) +          .add(Op1L) +          .add(Op0L); + +  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) +      .addReg(DestSub0) +      .addImm(AMDGPU::sub0) +      .addReg(DestSub1) +      .addImm(AMDGPU::sub1); + +  MRI.replaceRegWith(Dest.getReg(), FullDestReg); + +  // Try to legalize the operands in case we need to swap the order to keep it +  // valid. +  legalizeOperands(*HiHalf, MDT); +  legalizeOperands(*LoHalf, MDT); + +  // Move all users of this moved value. +  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} +  void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,                                             MachineInstr &Inst, unsigned Opcode,                                             MachineDominatorTree *MDT) const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 46eee6fae0a5..37ee159362a2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -138,6 +138,12 @@ private:                                  unsigned Opcode,                                  MachineDominatorTree *MDT = nullptr) const; +  void splitScalarSMulU64(SIInstrWorklist &Worklist, MachineInstr &Inst, +                          MachineDominatorTree *MDT) const; + +  void splitScalarSMulPseudo(SIInstrWorklist &Worklist, MachineInstr &Inst, +                             MachineDominatorTree *MDT) const; +    void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,                              MachineDominatorTree *MDT = nullptr) const; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 173c877b8d29..f07b8fa0ea4c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{    return Imm < 32;  }]>; -def getNegV2I16Imm : SDNodeXForm<build_vector, [{ -  return SDValue(packNegConstantV2I16(N, *CurDAG), 0); -}]>; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ -  assert(N->getNumOperands() == 2); -  assert(N->getOperand(0).getValueType().getSizeInBits() == 16); -  SDValue Src0 = N->getOperand(0); -  SDValue Src1 = N->getOperand(1); -  if (Src0 == Src1) -    return isNegInlineImmediate(Src0.getNode()); - -  return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || -         (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - -  def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{    return fp16SrcZerosHighBits(N->getOpcode());  }]>; @@ -1144,6 +1127,8 @@ def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;  def wait_vdst : NamedIntOperand<i8, "wait_vdst", "WaitVDST">;  def wait_exp : 
NamedIntOperand<i8, "wait_exp", "WaitEXP">; +def wait_va_vdst : NamedIntOperand<i8, "wait_va_vdst", "WaitVAVDst">; +def wait_va_vsrc : NamedIntOperand<i8, "wait_vm_vsrc", "WaitVMVSrc">;  class KImmFPOperand<ValueType vt> : ImmOperand<vt> {    let OperandNamespace = "AMDGPU"; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index 8310c6b57dad..b4bd46d33c1f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,7 @@ include "SMInstructions.td"  include "FLATInstructions.td"  include "BUFInstructions.td"  include "EXPInstructions.td" -include "LDSDIRInstructions.td" +include "DSDIRInstructions.td"  include "VINTERPInstructions.td"  //===----------------------------------------------------------------------===// @@ -111,7 +111,6 @@ def ATOMIC_FENCE : SPseudoInstSI<    [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],    "ATOMIC_FENCE $ordering, $scope"> {    let hasSideEffects = 1; -  let maybeAtomic = 1;  }  let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -316,6 +315,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI <    (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)  >; +let OtherPredicates = [HasShaderCyclesHiLoRegisters] in +def GET_SHADERCYCLESHILO : SPseudoInstSI< +  (outs SReg_64:$sdst), (ins), +  [(set SReg_64:$sdst, (i64 (readcyclecounter)))] +>; +  } // End usesCustomInserter = 1, Defs = [SCC]  let usesCustomInserter = 1 in { @@ -557,6 +562,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),    let hasNoSchedulingInfo = 1;    let FixedSize = 1;    let isMeta = 1; +  let maybeAtomic = 0;  }  // Used as an isel pseudo to directly emit initialization with an @@ -1097,7 +1103,7 @@ def : Pat <  multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {    // f16_to_fp patterns    def : GCNPat < -    (f32 (f16_to_fp i32:$src0)), +    (f32 (any_f16_to_fp i32:$src0)),      (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)    >; @@ -1122,7 +1128,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16    >;    def : GCNPat < -    (f64 (fpextend f16:$src)), +    (f64 (any_fpextend f16:$src)),      (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))    >; @@ -1151,6 +1157,13 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16      (f16 (uint_to_fp i32:$src)),      (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))    >; + +  // This is only used on targets without half support +  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering +  def : GCNPat < +    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), +    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) +  >;  }  let SubtargetPredicate = NotHasTrue16BitInsts in @@ -1515,6 +1528,23 @@ def : BitConvert <v2f16, f32, SReg_32>;  def : BitConvert <f32, v2f16, SReg_32>;  def : BitConvert <v2i16, f32, SReg_32>;  def : BitConvert <f32, v2i16, SReg_32>; +def : BitConvert <v2bf16, i32, SReg_32>; +def : BitConvert <i32, v2bf16, SReg_32>; +def : BitConvert <v2bf16, i32, VGPR_32>; +def : BitConvert <i32, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, v2i16, SReg_32>; +def : BitConvert <v2i16, v2bf16, SReg_32>; +def : BitConvert <v2bf16, v2i16, VGPR_32>; +def : BitConvert <v2i16, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, v2f16, SReg_32>; 
+def : BitConvert <v2f16, v2bf16, SReg_32>; +def : BitConvert <v2bf16, v2f16, VGPR_32>; +def : BitConvert <v2f16, v2bf16, VGPR_32>; +def : BitConvert <f32, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, f32, VGPR_32>; +def : BitConvert <f32, v2bf16, SReg_32>; +def : BitConvert <v2bf16, f32, SReg_32>; +  // 64-bit bitcast  def : BitConvert <i64, f64, VReg_64>; @@ -1531,6 +1561,19 @@ def : BitConvert <f64, v2i32, VReg_64>;  def : BitConvert <v2i32, f64, VReg_64>;  def : BitConvert <v4i16, v4f16, VReg_64>;  def : BitConvert <v4f16, v4i16, VReg_64>; +def : BitConvert <v4bf16, v2i32, VReg_64>; +def : BitConvert <v2i32, v4bf16, VReg_64>; +def : BitConvert <v4bf16, i64, VReg_64>; +def : BitConvert <i64, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v4i16, VReg_64>; +def : BitConvert <v4i16, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v4f16, VReg_64>; +def : BitConvert <v4f16, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v2f32, VReg_64>; +def : BitConvert <v2f32, v4bf16, VReg_64>; +def : BitConvert <v4bf16, f64, VReg_64>; +def : BitConvert <f64, v4bf16, VReg_64>; +  // FIXME: Make SGPR  def : BitConvert <v2i32, v4f16, VReg_64>; @@ -1590,6 +1633,37 @@ def : BitConvert <v2f64, v8i16, SReg_128>;  def : BitConvert <v2i64, v8f16, SReg_128>;  def : BitConvert <v2f64, v8f16, SReg_128>; +def : BitConvert <v4i32, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v4i32, SReg_128>; +def : BitConvert <v4i32, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v4i32, VReg_128>; + +def : BitConvert <v4f32, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v4f32, SReg_128>; +def : BitConvert <v4f32, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v4f32, VReg_128>; + +def : BitConvert <v8i16, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v8i16, SReg_128>; +def : BitConvert <v8i16, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v8i16, VReg_128>; + +def : BitConvert <v8f16, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v8f16, SReg_128>; +def : BitConvert <v8f16, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v8f16, VReg_128>; + +def : BitConvert <v2f64, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v2f64, SReg_128>; +def : BitConvert <v2f64, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v2f64, VReg_128>; + +def : BitConvert <v2i64, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v2i64, SReg_128>; +def : BitConvert <v2i64, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v2i64, VReg_128>; + +  // 160-bit bitcast  def : BitConvert <v5i32, v5f32, SReg_160>;  def : BitConvert <v5f32, v5i32, SReg_160>; @@ -1654,6 +1728,31 @@ def : BitConvert <v4i64, v16i16, VReg_256>;  def : BitConvert <v4f64, v16f16, VReg_256>;  def : BitConvert <v4f64, v16i16, VReg_256>; + +def : BitConvert <v8i32, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v8i32, VReg_256>; +def : BitConvert <v8f32, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v8f32, VReg_256>; +def : BitConvert <v4i64, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v4i64, VReg_256>; +def : BitConvert <v4f64, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v4f64, VReg_256>; + + + +def : BitConvert <v16i16, v16bf16, SReg_256>; +def : BitConvert <v16bf16, v16i16, SReg_256>; +def : BitConvert <v16i16, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v16i16, VReg_256>; + +def : BitConvert <v16f16, v16bf16, SReg_256>; +def : BitConvert <v16bf16, v16f16, SReg_256>; +def : BitConvert <v16f16, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v16f16, VReg_256>; + + + +  // 288-bit bitcast  def : BitConvert <v9i32, v9f32, SReg_288>;  def : BitConvert <v9f32, v9i32, SReg_288>; @@ -1702,6 +1801,38 @@ def : 
BitConvert <v8f64,  v16f32, VReg_512>;  def : BitConvert <v16f32, v8i64,  VReg_512>;  def : BitConvert <v16f32, v8f64,  VReg_512>; + + +def : BitConvert <v32bf16, v32i16, VReg_512>; +def : BitConvert <v32i16, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v32i16, SReg_512>; +def : BitConvert <v32i16, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v32f16, VReg_512>; +def : BitConvert <v32f16, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v32f16, SReg_512>; +def : BitConvert <v32f16, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v16i32, VReg_512>; +def : BitConvert <v16i32, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v16i32, SReg_512>; +def : BitConvert <v16i32, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v16f32, SReg_512>; +def : BitConvert <v16f32, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v8f64, VReg_512>; +def : BitConvert <v8f64, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v8f64, SReg_512>; +def : BitConvert <v8f64, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v8i64, VReg_512>; +def : BitConvert <v8i64, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v8i64, SReg_512>; +def : BitConvert <v8i64, v32bf16, SReg_512>; +  // 1024-bit bitcast  def : BitConvert <v32i32, v32f32, VReg_1024>;  def : BitConvert <v32f32, v32i32, VReg_1024>; @@ -1958,19 +2089,21 @@ def : GCNPat <    let SubtargetPredicate = HasPackedFP32Ops;  } +foreach fp16vt = [f16, bf16] in { +  def : GCNPat < -  (fcopysign f16:$src0, f16:$src1), +  (fcopysign fp16vt:$src0, fp16vt:$src1),    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)  >;  def : GCNPat < -  (fcopysign f32:$src0, f16:$src1), +  (fcopysign f32:$src0, fp16vt:$src1),    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,               (V_LSHLREV_B32_e64 (i32 16), $src1))  >;  def : GCNPat < -  (fcopysign f64:$src0, f16:$src1), +  (fcopysign f64:$src0, fp16vt:$src1),    (REG_SEQUENCE SReg_64,      (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,      (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), @@ -1978,16 +2111,17 @@ def : GCNPat <  >;  def : GCNPat < -  (fcopysign f16:$src0, f32:$src1), +  (fcopysign fp16vt:$src0, f32:$src1),    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,               (V_LSHRREV_B32_e64 (i32 16), $src1))  >;  def : GCNPat < -  (fcopysign f16:$src0, f64:$src1), +  (fcopysign fp16vt:$src0, f64:$src1),    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,               (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))  >; +} // End foreach fp16vt = [f16, bf16]  /********** ================== **********/  /********** Immediate Patterns **********/ @@ -2026,6 +2160,11 @@ def : GCNPat <    (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))  >; +def : GCNPat < +  (VGPRImm<(bf16 fpimm)>:$imm), +  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm))) +>; +  // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit  // immediate and wil be expanded as needed, but we will only use these patterns  // for values which can be encoded. 
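The bf16 immediate pattern added just above, together with the S_MOV_B32 variant added in the next hunk, materializes a bf16 constant with a plain 32-bit move of its bit pattern (assuming bitcast_fpimm_to_i32 yields that raw encoding as an i32, which this listing does not spell out). For orientation, bf16 reuses the upper 16 bits of the corresponding f32 encoding, so exact constants such as 1.0 or -2.0 need no rounding; a small illustration of the bit pattern only:

  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  // bf16 is the top half of the f32 encoding.
  static uint16_t bf16Bits(float F) {
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    return static_cast<uint16_t>(Bits >> 16);
  }

  int main() {
    std::printf("bf16(1.0)  = 0x%04X\n", bf16Bits(1.0f));  // 0x3F80
    std::printf("bf16(-2.0) = 0x%04X\n", bf16Bits(-2.0f)); // 0xC000
    return 0;
  }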
@@ -2060,6 +2199,11 @@ def : GCNPat <  >;  def : GCNPat < +  (bf16 fpimm:$imm), +  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : GCNPat <    (p5 frameindex:$fi),    (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))  >; @@ -3741,6 +3885,18 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {    let mayStore = 0;  } +def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction { +  let OutOperandList = (outs type0:$dst); +  let InOperandList = (ins type0:$src0, type0:$src1); +  let hasSideEffects = 0; +} + +def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { +  let OutOperandList = (outs type0:$dst); +  let InOperandList = (ins type0:$src0, type0:$src1); +  let hasSideEffects = 0; +} +  // This is equivalent to the G_INTRINSIC*, but the operands may have  // been legalized depending on the subtarget requirements.  def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 10ec54d3317f..6d749ad1ad24 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -578,6 +578,14 @@ public:                                        bool IsNonTemporal) const override;  }; +class SIGfx12CacheControl : public SIGfx11CacheControl { +public: +  SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + +  bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, +                     SIAtomicAddrSpace AddrSpace, Position Pos) const override; +}; +  class SIMemoryLegalizer final : public MachineFunctionPass {  private: @@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {      return std::make_unique<SIGfx7CacheControl>(ST);    if (Generation < AMDGPUSubtarget::GFX11)      return std::make_unique<SIGfx10CacheControl>(ST); -  return std::make_unique<SIGfx11CacheControl>(ST); +  if (Generation < AMDGPUSubtarget::GFX12) +    return std::make_unique<SIGfx11CacheControl>(ST); +  return std::make_unique<SIGfx12CacheControl>(ST);  }  bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,    bool Changed = false;    MachineBasicBlock &MBB = *MI->getParent(); -  DebugLoc DL = MI->getDebugLoc(); +  const DebugLoc &DL = MI->getDebugLoc();    if (Pos == Position::AFTER)      ++MI; @@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(    return Changed;  } +bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, +                                        SIAtomicScope Scope, +                                        SIAtomicAddrSpace AddrSpace, +                                        Position Pos) const { +  if (!InsertCacheInv) +    return false; + +  MachineBasicBlock &MBB = *MI->getParent(); +  DebugLoc DL = MI->getDebugLoc(); + +  /// The scratch address space does not need the global memory cache +  /// to be flushed as all memory operations by the same thread are +  /// sequentially consistent, and no other thread can access scratch +  /// memory. + +  /// Other address spaces do not have a cache. 
+  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) +    return false; + +  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; +  switch (Scope) { +  case SIAtomicScope::SYSTEM: +    ScopeImm = AMDGPU::CPol::SCOPE_SYS; +    break; +  case SIAtomicScope::AGENT: +    ScopeImm = AMDGPU::CPol::SCOPE_DEV; +    break; +  case SIAtomicScope::WORKGROUP: +    // In WGP mode the waves of a work-group can be executing on either CU of +    // the WGP. Therefore we need to invalidate the L0 which is per CU. +    // Otherwise in CU mode all waves of a work-group are on the same CU, and so +    // the L0 does not need to be invalidated. +    if (ST.isCuModeEnabled()) +      return false; + +    ScopeImm = AMDGPU::CPol::SCOPE_SE; +    break; +  case SIAtomicScope::WAVEFRONT: +  case SIAtomicScope::SINGLETHREAD: +    // No cache to invalidate. +    return false; +  default: +    llvm_unreachable("Unsupported synchronization scope"); +  } + +  if (Pos == Position::AFTER) +    ++MI; + +  BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); + +  if (Pos == Position::AFTER) +    --MI; + +  return true; +} +  bool SIMemoryLegalizer::removeAtomicPseudoMIs() {    if (AtomicPseudoMIs.empty())      return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894c5841..f42af89cf5e6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -916,7 +916,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;  defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;  let GlobalPriority = true in { -defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], SGPR_512Regs, TTMP_512Regs>;  defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;  } @@ -970,7 +970,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;  defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;  let GlobalPriority = true in { -defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], (add VGPR_512)>;  defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;  } @@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix>  class RegOrV2B16 <string RegisterClass, string OperandTypePrefix>    : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", -                     !subst("_v2b16", "V2B16", NAME), "_Imm16">; +                     !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">;  class RegOrV2F16 <string RegisterClass, string OperandTypePrefix>    : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16", -                     !subst("_v2f16", "V2F16", NAME), "_Imm16">; +                     !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">;  class RegOrF64 <string RegisterClass, string OperandTypePrefix>    : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 59d6ccf513bb..5e6c34992930 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -553,7 +553,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,          }          continue;        } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || -                 Opcode == AMDGPU::LDS_DIRECT_LOAD) { +                 Opcode == AMDGPU::DS_PARAM_LOAD || +                 Opcode == AMDGPU::LDS_DIRECT_LOAD || +                 Opcode == AMDGPU::DS_DIRECT_LOAD) {          // Mark these STRICTWQM, but only for the instruction, not its operands.          // This avoid unnecessarily marking M0 as requiring WQM.          InstrInfo &II = Instructions[&MI]; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td index 3297847b0360..fc29ce8d71f2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -29,6 +29,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt    let mayStore = 0;    let mayLoad = 1;    let hasSideEffects = 0; +  let maybeAtomic = 0;    let UseNamedOperandTable = 1;    let SchedRW = [WriteSMEM]; @@ -305,6 +306,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in  defm S_LOAD_DWORDX4  : SM_Pseudo_Loads <SReg_64, SReg_128>;  defm S_LOAD_DWORDX8  : SM_Pseudo_Loads <SReg_64, SReg_256>;  defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>; +defm S_LOAD_I8       : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_U8       : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_I16      : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_U16      : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;  let is_buffer = 1 in {  defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; @@ -316,6 +321,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in  defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;  defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;  defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>; +defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;  }  let SubtargetPredicate = HasScalarStores in { @@ -977,20 +986,35 @@ def : GCNPat <  }  } // let OtherPredicates = [HasShaderCyclesRegister] -multiclass SMPrefetchPat<string type, int cache_type> { +def i32imm_zero : TImmLeaf <i32, [{ +  return Imm == 0; +}]>; + +def i32imm_one : TImmLeaf <i32, [{ +  return Imm == 1; +}]>; + +multiclass SMPrefetchPat<string type, TImmLeaf cache_type> {    def : GCNPat < -    (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), +    (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type),      (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))    >;    def : GCNPat < -    (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), +    (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, cache_type),      (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))    >; + +  def : GCNPat < +    (smrd_prefetch (i32 SReg_32:$sbase), timm, timm, cache_type), +    (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) +        (i64 (REG_SEQUENCE SReg_64, $sbase, sub0, (i32 (S_MOV_B32 (i32 0))), 
sub1)), +        0, (i32 SGPR_NULL), (i8 0)) +  >;  } -defm : SMPrefetchPat<"INST", 0>; -defm : SMPrefetchPat<"DATA", 1>; +defm : SMPrefetchPat<"INST", i32imm_zero>; +defm : SMPrefetchPat<"DATA", i32imm_one>;  //===----------------------------------------------------------------------===//  // GFX10. @@ -1321,6 +1345,11 @@ defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;  defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;  defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">; +defm S_LOAD_I8   : SM_Real_Loads_gfx12<0x08>; +defm S_LOAD_U8   : SM_Real_Loads_gfx12<0x09>; +defm S_LOAD_I16  : SM_Real_Loads_gfx12<0x0a>; +defm S_LOAD_U16  : SM_Real_Loads_gfx12<0x0b>; +  defm S_BUFFER_LOAD_B32  : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;  defm S_BUFFER_LOAD_B64  : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;  defm S_BUFFER_LOAD_B96  : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">; @@ -1328,6 +1357,11 @@ defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;  defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;  defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">; +defm S_BUFFER_LOAD_I8  : SM_Real_Loads_gfx12<0x18>; +defm S_BUFFER_LOAD_U8  : SM_Real_Loads_gfx12<0x19>; +defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx12<0x1a>; +defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx12<0x1b>; +  def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;  def S_PREFETCH_INST_gfx12        : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td index c9687ac368d3..46fa3d57a21c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -673,6 +673,16 @@ let SubtargetPredicate = isGFX12Plus in {      let isCommutable = 1;    } +  // The higher 32-bits of the inputs contain the sign extension bits. +  def S_MUL_I64_I32_PSEUDO : SPseudoInstSI < +    (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) +  >; + +  // The higher 32-bits of the inputs are zero. 
+  def S_MUL_U64_U32_PSEUDO : SPseudoInstSI < +    (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) +  >; +  } // End SubtargetPredicate = isGFX12Plus  let Uses = [SCC] in { @@ -1186,14 +1196,12 @@ let SubtargetPredicate = isGFX10Plus in {  let SubtargetPredicate = isGFX10GFX11 in {    def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">;    def S_SUBVECTOR_LOOP_END   : SOPK_32_BR<"s_subvector_loop_end">; -} // End SubtargetPredicate = isGFX10GFX11 -let SubtargetPredicate = isGFX10Plus in {    def S_WAITCNT_VSCNT   : SOPK_WAITCNT<"s_waitcnt_vscnt">;    def S_WAITCNT_VMCNT   : SOPK_WAITCNT<"s_waitcnt_vmcnt">;    def S_WAITCNT_EXPCNT  : SOPK_WAITCNT<"s_waitcnt_expcnt">;    def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">; -} // End SubtargetPredicate = isGFX10Plus +} // End SubtargetPredicate = isGFX10GFX11  //===----------------------------------------------------------------------===//  // SOPC Instructions @@ -1702,6 +1710,27 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in {      SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">;  } // End SubtargetPredicate = HasVGPRSingeUseHintInsts +let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { +  def S_WAIT_LOADCNT : +    SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_LOADCNT_DSCNT : +    SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_STORECNT : +    SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_STORECNT_DSCNT : +    SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_SAMPLECNT : +    SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_BVHCNT : +    SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_EXPCNT : +    SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_DSCNT : +    SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">; +  def S_WAIT_KMCNT : +    SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; +} // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 +  //===----------------------------------------------------------------------===//  // SOP1 Patterns  //===----------------------------------------------------------------------===// @@ -2411,10 +2440,10 @@ defm S_SETREG_IMM32_B32     : SOPK_Real64_gfx11_gfx12<0x013>;  defm S_CALL_B64             : SOPK_Real32_gfx11_gfx12<0x014>;  defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;  defm S_SUBVECTOR_LOOP_END   : SOPK_Real32_gfx11<0x017>; -defm S_WAITCNT_VSCNT        : SOPK_Real32_gfx11_gfx12<0x018>; -defm S_WAITCNT_VMCNT        : SOPK_Real32_gfx11_gfx12<0x019>; -defm S_WAITCNT_EXPCNT       : SOPK_Real32_gfx11_gfx12<0x01a>; -defm S_WAITCNT_LGKMCNT      : SOPK_Real32_gfx11_gfx12<0x01b>; +defm S_WAITCNT_VSCNT        : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT        : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT       : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT      : SOPK_Real32_gfx11<0x01b>;  //===----------------------------------------------------------------------===//  // SOPK - GFX10. 
@@ -2516,6 +2545,15 @@ multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, st  defm S_WAIT_ALU             : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">;  defm S_BARRIER_WAIT         : SOPP_Real_32_gfx12<0x014>;  defm S_BARRIER_LEAVE        : SOPP_Real_32_gfx12<0x015>; +defm S_WAIT_LOADCNT         : SOPP_Real_32_gfx12<0x040>; +defm S_WAIT_STORECNT        : SOPP_Real_32_gfx12<0x041>; +defm S_WAIT_SAMPLECNT       : SOPP_Real_32_gfx12<0x042>; +defm S_WAIT_BVHCNT          : SOPP_Real_32_gfx12<0x043>; +defm S_WAIT_EXPCNT          : SOPP_Real_32_gfx12<0x044>; +defm S_WAIT_DSCNT           : SOPP_Real_32_gfx12<0x046>; +defm S_WAIT_KMCNT           : SOPP_Real_32_gfx12<0x047>; +defm S_WAIT_LOADCNT_DSCNT   : SOPP_Real_32_gfx12<0x048>; +defm S_WAIT_STORECNT_DSCNT  : SOPP_Real_32_gfx12<0x049>;  //===----------------------------------------------------------------------===//  // SOPP - GFX11, GFX12. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a91d77175234..26ba2575ff34 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {           Val == 0x3118;   // 1/2pi  } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { -  assert(HasInv2Pi); - -  if (isInt<16>(Literal) || isUInt<16>(Literal)) { -    int16_t Trunc = static_cast<int16_t>(Literal); -    return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { +  // Unfortunately, the Instruction Set Architecture Reference Guide is +  // misleading about how the inline operands work for (packed) 16-bit +  // instructions. In a nutshell, the actual HW behavior is: +  // +  //  - integer encodings (-16 .. 
64) are always produced as sign-extended +  //    32-bit values +  //  - float encodings are produced as: +  //    - for F16 instructions: corresponding half-precision float values in +  //      the LSBs, 0 in the MSBs +  //    - for UI16 instructions: corresponding single-precision float value +  int32_t Signed = static_cast<int32_t>(Literal); +  if (Signed >= 0 && Signed <= 64) +    return 128 + Signed; + +  if (Signed >= -16 && Signed <= -1) +    return 192 + std::abs(Signed); + +  if (IsFloat) { +    // clang-format off +    switch (Literal) { +    case 0x3800: return 240; // 0.5 +    case 0xB800: return 241; // -0.5 +    case 0x3C00: return 242; // 1.0 +    case 0xBC00: return 243; // -1.0 +    case 0x4000: return 244; // 2.0 +    case 0xC000: return 245; // -2.0 +    case 0x4400: return 246; // 4.0 +    case 0xC400: return 247; // -4.0 +    case 0x3118: return 248; // 1.0 / (2.0 * pi) +    default: break; +    } +    // clang-format on +  } else { +    // clang-format off +    switch (Literal) { +    case 0x3F000000: return 240; // 0.5 +    case 0xBF000000: return 241; // -0.5 +    case 0x3F800000: return 242; // 1.0 +    case 0xBF800000: return 243; // -1.0 +    case 0x40000000: return 244; // 2.0 +    case 0xC0000000: return 245; // -2.0 +    case 0x40800000: return 246; // 4.0 +    case 0xC0800000: return 247; // -4.0 +    case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) +    default: break; +    } +    // clang-format on    } -  if (!(Literal & 0xffff)) -    return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); -  int16_t Lo16 = static_cast<int16_t>(Literal); -  int16_t Hi16 = static_cast<int16_t>(Literal >> 16); -  return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); +  return {};  } -bool isInlinableIntLiteralV216(int32_t Literal) { -  int16_t Lo16 = static_cast<int16_t>(Literal); -  if (isInt<16>(Literal) || isUInt<16>(Literal)) -    return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { +  return getInlineEncodingV216(false, Literal); +} -  int16_t Hi16 = static_cast<int16_t>(Literal >> 16); -  if (!(Literal & 0xffff)) -    return isInlinableIntLiteral(Hi16); -  return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { +  return getInlineEncodingV216(true, Literal);  } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. 
+bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) {    switch (OpType) { +  case AMDGPU::OPERAND_REG_IMM_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: +  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: +    return getInlineEncodingV216(false, Literal).has_value();    case AMDGPU::OPERAND_REG_IMM_V2FP16:    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: -    return isInlinableLiteralV216(Literal, HasInv2Pi); +  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: +    return getInlineEncodingV216(true, Literal).has_value();    default: -    return isInlinableIntLiteralV216(Literal); +    llvm_unreachable("bad packed operand type");    }  } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { -  assert(HasInv2Pi); - -  int16_t Lo16 = static_cast<int16_t>(Literal); -  if (isInt<16>(Literal) || isUInt<16>(Literal)) -    return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction. +bool isInlinableLiteralV2I16(uint32_t Literal) { +  return getInlineEncodingV2I16(Literal).has_value(); +} -  int16_t Hi16 = static_cast<int16_t>(Literal >> 16); -  if (!(Literal & 0xffff)) -    return true; -  return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. +bool isInlinableLiteralV2F16(uint32_t Literal) { +  return getInlineEncodingV2F16(Literal).has_value();  }  bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330cbcde..50c741760d71 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE  bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);  LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal);  LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal);  LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType);  LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal);  LLVM_READNONE  bool isValid32BitLiteral(uint64_t Val, bool IsFP64); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 0aa62ea77b11..ecee61daa1c8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1300,7 +1300,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,    let OtherPredicates = ps.OtherPredicates;  } -  +  class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,                      VOPProfile p = ps.Pfl> :      VOP2_DPP8<op, ps, p> { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501b5d90..e9d6f67aee16 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2  let 
SubtargetPredicate = HasVOP3PInsts in { -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. -// TODO: We could directly encode the immediate now -def : GCNPat< -  (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), -  (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; -  // Integer operations with clamp bit set.  class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<    (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), @@ -632,12 +623,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node,      // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.      let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {        def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), -                         !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>, +                         !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node>)>,                   MFMATable<0, NAME # "_e64">;        let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in        def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), -                                !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>, +                                !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node>)>,                          MFMATable<0, NAME # "_vgprcd_e64">;      } @@ -645,12 +636,13 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node,        let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),            isConvertibleToThreeAddress = NoDstOverlap,            Mnemonic = OpName in { -        def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>, +        def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), +                                 !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node>)>,                           MFMATable<1, NAME # "_e64">;          let SubtargetPredicate = isGFX90APlus in          def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), -                                      VgprMAIFrag<node>>, +                                      !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node>)>,                                MFMATable<1, NAME # "_vgprcd_e64">;        }      } diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp index 28e35f8f2a54..17c2d7bb13b4 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp @@ -170,7 +170,7 @@ bool ARCDAGToDAGISel::SelectFrameADDR_ri(SDValue Addr, SDValue &Base,  void ARCDAGToDAGISel::Select(SDNode *N) {    switch (N->getOpcode()) {    case ISD::Constant: { -    uint64_t CVal = cast<ConstantSDNode>(N)->getZExtValue(); +    uint64_t CVal = N->getAsZExtVal();      ReplaceNode(N, CurDAG->getMachineNode(                         isInt<12>(CVal) ? 
ARC::MOV_rs12 : ARC::MOV_rlimm,                         SDLoc(N), MVT::i32, diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp index 2265f5db6737..5dd343d97b80 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCISelLowering.cpp @@ -174,6 +174,8 @@ ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM,    setOperationAction(ISD::READCYCLECOUNTER, MVT::i32, Legal);    setOperationAction(ISD::READCYCLECOUNTER, MVT::i64,                       isTypeLegal(MVT::i64) ? Legal : Custom); + +  setMaxAtomicSizeInBitsSupported(0);  }  const char *ARCTargetLowering::getTargetNodeName(unsigned Opcode) const { diff --git a/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp index d4ae3255b32a..4f612ae623b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -57,6 +57,7 @@ public:      return getTM<ARCTargetMachine>();    } +  void addIRPasses() override;    bool addInstSelector() override;    void addPreEmitPass() override;    void addPreRegAlloc() override; @@ -68,6 +69,12 @@ TargetPassConfig *ARCTargetMachine::createPassConfig(PassManagerBase &PM) {    return new ARCPassConfig(*this, PM);  } +void ARCPassConfig::addIRPasses() { +  addPass(createAtomicExpandPass()); + +  TargetPassConfig::addIRPasses(); +} +  bool ARCPassConfig::addInstSelector() {    addPass(createARCISelDag(getARCTargetMachine(), getOptLevel()));    return false; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp index 1d6aaeb7433b..cb3a709f7003 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -747,7 +747,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {            unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();            TmpOffset += SL->getElementOffset(Idx);          } else { -          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +          uint64_t S = GTI.getSequentialElementStride(DL);            while (true) {              if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {                // Constant-offset addressing. diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index adc429b61bbc..e99ee299412a 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -372,7 +372,7 @@ INITIALIZE_PASS(ARMDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)  /// operand. If so Imm will receive the 32-bit value.  
static bool isInt32Immediate(SDNode *N, unsigned &Imm) {    if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { -    Imm = cast<ConstantSDNode>(N)->getZExtValue(); +    Imm = N->getAsZExtVal();      return true;    }    return false; @@ -1101,8 +1101,7 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N,    if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {      Offset = N.getOperand(0);      SDValue N1 = N.getOperand(1); -    Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), -                                      SDLoc(N), MVT::i32); +    Label = CurDAG->getTargetConstant(N1->getAsZExtVal(), SDLoc(N), MVT::i32);      return true;    } @@ -1942,7 +1941,7 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,    if (!is64BitVector && NumVecs < 3)      NumRegs *= 2; -  unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); +  unsigned Alignment = Align->getAsZExtVal();    if (Alignment >= 32 && NumRegs == 4)      Alignment = 32;    else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) @@ -2428,7 +2427,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,    unsigned Alignment = 0;    if (NumVecs != 3) { -    Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); +    Alignment = Align->getAsZExtVal();      unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;      if (Alignment > NumBytes)        Alignment = NumBytes; @@ -2871,7 +2870,7 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,      Ops.push_back(N->getOperand(OpIdx++));   // limit    SDValue ImmOp = N->getOperand(OpIdx++);    // step -  int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue(); +  int ImmValue = ImmOp->getAsZExtVal();    Ops.push_back(getI32Imm(ImmValue, Loc));    if (Predicated) @@ -2892,7 +2891,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,    // Convert and append the immediate operand designating the coprocessor.    SDValue ImmCorpoc = N->getOperand(OpIdx++); -  uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue(); +  uint32_t ImmCoprocVal = ImmCorpoc->getAsZExtVal();    Ops.push_back(getI32Imm(ImmCoprocVal, Loc));    // For accumulating variants copy the low and high order parts of the @@ -2911,7 +2910,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,    // Convert and append the immediate operand    SDValue Imm = N->getOperand(OpIdx); -  uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue(); +  uint32_t ImmVal = Imm->getAsZExtVal();    Ops.push_back(getI32Imm(ImmVal, Loc));    // Accumulating variants are IT-predicable, add predicate operands. @@ -2965,7 +2964,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,    unsigned Alignment = 0;    if (NumVecs != 3) { -    Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); +    Alignment = Align->getAsZExtVal();      unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8;      if (Alignment > NumBytes)        Alignment = NumBytes; @@ -3697,7 +3696,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {      // Other cases are autogenerated.      
break;    case ISD::Constant: { -    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); +    unsigned Val = N->getAsZExtVal();      // If we can't materialize the constant we need to use a literal pool      if (ConstantMaterializationCost(Val, Subtarget) > 2 &&          !Subtarget->genExecuteOnly()) { @@ -4132,7 +4131,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {      assert(N2.getOpcode() == ISD::Constant);      assert(N3.getOpcode() == ISD::Register); -    unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue(); +    unsigned CC = (unsigned)N2->getAsZExtVal();      if (InGlue.getOpcode() == ARMISD::CMPZ) {        if (InGlue.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { @@ -4243,8 +4242,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {        if (SwitchEQNEToPLMI) {          SDValue ARMcc = N->getOperand(2); -        ARMCC::CondCodes CC = -          (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); +        ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();          switch (CC) {          default: llvm_unreachable("CMPZ must be either NE or EQ!"); diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9f3bcffc7a99..568085bd0ab3 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4820,8 +4820,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,    // some tweaks to the heuristics for the previous and->shift transform.    // FIXME: Optimize cases where the LHS isn't a shift.    if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && -      isa<ConstantSDNode>(RHS) && -      cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && +      isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&        CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&        LHS.getConstantOperandVal(1) < 31) {      unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1; @@ -5533,7 +5532,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);      // Choose GE over PL, which vsel does now support -    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) +    if (ARMcc->getAsZExtVal() == ARMCC::PL)        ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);      return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);    } @@ -7749,7 +7748,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,    uint64_t Val;    if (!isa<ConstantSDNode>(N))      return SDValue(); -  Val = cast<ConstantSDNode>(N)->getZExtValue(); +  Val = N->getAsZExtVal();    if (ST->isThumb1Only()) {      if (Val <= 255 || ~Val <= 255) @@ -7804,7 +7803,7 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,      SDValue V = Op.getOperand(i);      if (!isa<ConstantSDNode>(V) && !V.isUndef())        continue; -    bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); +    bool BitSet = V.isUndef() ? 
false : V->getAsZExtVal();      if (BitSet)        Bits32 |= BoolMask << (i * BitsPerBool);    } @@ -9240,7 +9239,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,    EVT VT = Op.getValueType();    EVT Op1VT = V1.getValueType();    unsigned NumElts = VT.getVectorNumElements(); -  unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); +  unsigned Index = V2->getAsZExtVal();    assert(VT.getScalarSizeInBits() == 1 &&           "Unexpected custom EXTRACT_SUBVECTOR lowering"); @@ -14618,7 +14617,7 @@ static SDValue PerformORCombineToBFI(SDNode *N,      // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask      // where lsb(mask) == #shamt and masked bits of B are known zero.      SDValue ShAmt = N00.getOperand(1); -    unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); +    unsigned ShAmtC = ShAmt->getAsZExtVal();      unsigned LSB = llvm::countr_zero(Mask);      if (ShAmtC != LSB)        return SDValue(); @@ -18339,8 +18338,7 @@ ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {    SDValue Chain = N->getOperand(0);    SDValue BB = N->getOperand(1);    SDValue ARMcc = N->getOperand(2); -  ARMCC::CondCodes CC = -    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); +  ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();    // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))    // -> (brcond Chain BB CC CPSR Cmp) @@ -18373,8 +18371,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {    SDValue FalseVal = N->getOperand(0);    SDValue TrueVal = N->getOperand(1);    SDValue ARMcc = N->getOperand(2); -  ARMCC::CondCodes CC = -    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); +  ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();    // BFI is only available on V6T2+.    
if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 3ffde86ce1bb..abea0fef5cdc 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -362,8 +362,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,    llvm_unreachable("Unsupported size for FCmp predicate");  } -bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                      MachineInstr &MI) const { +bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                                      LostDebugLocObserver &LocObserver) const {    using namespace TargetOpcode;    MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; @@ -392,7 +392,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,                            OriginalResult};      auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy, 0},                                  {{MI.getOperand(1).getReg(), ArgTy, 0}, -                                 {MI.getOperand(2).getReg(), ArgTy, 0}}); +                                 {MI.getOperand(2).getReg(), ArgTy, 0}}, +                                LocObserver, &MI);      if (Status != LegalizerHelper::Legalized)        return false;      break; @@ -428,7 +429,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,        auto Status = createLibcall(MIRBuilder, Libcall.LibcallID,                                    {LibcallResult, RetTy, 0},                                    {{MI.getOperand(2).getReg(), ArgTy, 0}, -                                   {MI.getOperand(3).getReg(), ArgTy, 0}}); +                                   {MI.getOperand(3).getReg(), ArgTy, 0}}, +                                  LocObserver, &MI);        if (Status != LegalizerHelper::Legalized)          return false; diff --git a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h index f1c2e9c94336..d6ce4eb1055b 100644 --- a/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -23,12 +23,12 @@ namespace llvm {  class ARMSubtarget; -/// This class provides the information for the target register banks.  class ARMLegalizerInfo : public LegalizerInfo {  public:    ARMLegalizerInfo(const ARMSubtarget &ST); -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;  private:    void setFCmpLibcallsGNU(); diff --git a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp index d36bfb188ed3..f91e77adb8f8 100644 --- a/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -660,7 +660,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,    SDValue Cmp;    if (LHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(RHS)) { -    uint64_t Imm = cast<ConstantSDNode>(RHS)->getZExtValue(); +    uint64_t Imm = RHS->getAsZExtVal();      // Generate a CPI/CPC pair if RHS is a 16-bit constant. 
Use the zero      // register for the constant RHS if its lower or higher byte is zero.      SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS, @@ -680,7 +680,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS,    } else if (RHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(LHS)) {      // Generate a CPI/CPC pair if LHS is a 16-bit constant. Use the zero      // register for the constant LHS if its lower or higher byte is zero. -    uint64_t Imm = cast<ConstantSDNode>(LHS)->getZExtValue(); +    uint64_t Imm = LHS->getAsZExtVal();      SDValue LHSlo = (Imm & 0xff) == 0                          ? DAG.getRegister(Subtarget.getZeroRegister(), MVT::i8)                          : DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS, diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp index 2fe86e75ddae..4d8ace7c1ece 100644 --- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -151,6 +151,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM,    }    setBooleanContents(ZeroOrOneBooleanContent); +  setMaxAtomicSizeInBitsSupported(64);    // Function alignments    setMinFunctionAlignment(Align(8)); diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp index ab0db576f7f7..8a6e7ae3663e 100644 --- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -108,7 +108,8 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {    return new BPFPassConfig(*this, PM);  } -void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void BPFTargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) {    PB.registerPipelineParsingCallback(        [](StringRef PassName, FunctionPassManager &FPM,           ArrayRef<PassBuilder::PipelineElement>) { @@ -148,7 +149,9 @@ void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {  }  void BPFPassConfig::addIRPasses() { +  addPass(createAtomicExpandPass());    addPass(createBPFCheckAndAdjustIR()); +    TargetPassConfig::addIRPasses();  } diff --git a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h index 4e6adc722e76..0a28394463b2 100644 --- a/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -42,7 +42,8 @@ public:      return TLOF.get();    } -  void registerPassBuilderCallbacks(PassBuilder &PB) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override;  };  } diff --git a/contrib/llvm-project/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/contrib/llvm-project/llvm/lib/Target/DirectX/DXILResourceAnalysis.h index 8ffa1d7cd9b3..bce41160b95e 100644 --- a/contrib/llvm-project/llvm/lib/Target/DirectX/DXILResourceAnalysis.h +++ b/contrib/llvm-project/llvm/lib/Target/DirectX/DXILResourceAnalysis.h @@ -36,6 +36,7 @@ class DXILResourcePrinterPass : public PassInfoMixin<DXILResourcePrinterPass> {  public:    explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {}    PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +  static bool isRequired() { return true; }  };  /// 
The legacy pass manager's analysis pass to compute DXIL resource diff --git a/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index d5cb488f2fde..06938f8c74f1 100644 --- a/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -100,7 +100,8 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT,  DirectXTargetMachine::~DirectXTargetMachine() {} -void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void DirectXTargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) {    PB.registerPipelineParsingCallback(        [](StringRef PassName, ModulePassManager &PM,           ArrayRef<PassBuilder::PipelineElement>) { diff --git a/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.h index d04c375b2736..428beaf61cd0 100644 --- a/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/DirectX/DirectXTargetMachine.h @@ -47,7 +47,8 @@ public:    }    TargetTransformInfo getTargetTransformInfo(const Function &F) const override; -  void registerPassBuilderCallbacks(PassBuilder &PB) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override;  };  } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index eb5c59672224..defb1f7324f4 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -743,7 +743,7 @@ void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {  //  void HexagonDAGToDAGISel::SelectConstant(SDNode *N) {    if (N->getValueType(0) == MVT::i1) { -    assert(!(cast<ConstantSDNode>(N)->getZExtValue() >> 1)); +    assert(!(N->getAsZExtVal() >> 1));      unsigned Opc = (cast<ConstantSDNode>(N)->getSExtValue() != 0)                        ? Hexagon::PS_true                        : Hexagon::PS_false; diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 665e2d79c83d..81035849491b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1256,7 +1256,7 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV,        SDValue IdxV, const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {    MVT VecTy = ty(VecV);    unsigned HwLen = Subtarget.getVectorLength(); -  unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); +  unsigned Idx = IdxV.getNode()->getAsZExtVal();    MVT ElemTy = VecTy.getVectorElementType();    unsigned ElemWidth = ElemTy.getSizeInBits(); @@ -1299,7 +1299,7 @@ HexagonTargetLowering::extractHvxSubvectorPred(SDValue VecV, SDValue IdxV,    MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);    SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV);    // IdxV is required to be a constant. 
-  unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); +  unsigned Idx = IdxV.getNode()->getAsZExtVal();    unsigned ResLen = ResTy.getVectorNumElements();    unsigned BitBytes = HwLen / VecTy.getVectorNumElements(); @@ -1801,7 +1801,7 @@ HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)    MVT SrcTy = ty(SrcV);    MVT DstTy = ty(Op);    SDValue IdxV = Op.getOperand(1); -  unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); +  unsigned Idx = IdxV.getNode()->getAsZExtVal();    assert(Idx % DstTy.getVectorNumElements() == 0);    (void)Idx;    const SDLoc &dl(Op); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 590e464e1653..e7a692d67ba0 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -274,7 +274,8 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {    return I.get();  } -void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void HexagonTargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) {    PB.registerLateLoopOptimizationsEPCallback(        [=](LoopPassManager &LPM, OptimizationLevel Level) {          LPM.addPass(HexagonLoopIdiomRecognitionPass()); diff --git a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index dddd79ad1fcf..c5fed0cd65a8 100644 --- a/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -34,7 +34,8 @@ public:    ~HexagonTargetMachine() override;    const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; -  void registerPassBuilderCallbacks(PassBuilder &PB) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override;    TargetPassConfig *createPassConfig(PassManagerBase &PM) override;    TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 17d7ffb586f4..06de2ff1ae3e 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -166,6 +166,8 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM,    // Booleans always contain 0 or 1.    
setBooleanContents(ZeroOrOneBooleanContent); + +  setMaxAtomicSizeInBitsSupported(0);  }  SDValue LanaiTargetLowering::LowerOperation(SDValue Op, diff --git a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 039182b3ffe6..33479720183b 100644 --- a/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -93,6 +93,7 @@ public:      return getTM<LanaiTargetMachine>();    } +  void addIRPasses() override;    bool addInstSelector() override;    void addPreSched2() override;    void addPreEmitPass() override; @@ -104,6 +105,12 @@ LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) {    return new LanaiPassConfig(*this, &PassManager);  } +void LanaiPassConfig::addIRPasses() { +  addPass(createAtomicExpandPass()); + +  TargetPassConfig::addIRPasses(); +} +  // Install an instruction selector pass.  bool LanaiPassConfig::addInstSelector() {    addPass(createLanaiISelDag(getLanaiTargetMachine())); diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 66a37fce5dda..46f63a4103f9 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -121,6 +121,10 @@ class LoongArchAsmParser : public MCTargetAsmParser {    // Helper to emit pseudo instruction "li.w/d $rd, $imm".    void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); +  // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym". +  void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, +                      bool IsTailCall); +  public:    enum LoongArchMatchResultTy {      Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, @@ -400,6 +404,22 @@ public:                       IsValidKind;    } +  bool isSImm20pcaddu18i() const { +    if (!isImm()) +      return false; + +    int64_t Imm; +    LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None; +    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); +    bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None || +                       VK == LoongArchMCExpr::VK_LoongArch_CALL36; + +    return IsConstantImm +               ? isInt<20>(Imm) && IsValidKind +               : LoongArchAsmParser::classifySymbolRef(getImm(), VK) && +                     IsValidKind; +  } +    bool isSImm21lsl2() const {      if (!isImm())        return false; @@ -1110,6 +1130,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc,    }  } +void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, +                                        MCStreamer &Out, bool IsTailCall) { +  // call36 sym +  // expands to: +  //   pcaddu18i $ra, %call36(sym) +  //   jirl      $ra, $ra, 0 +  // +  // tail36 $rj, sym +  // expands to: +  //   pcaddu18i $rj, %call36(sym) +  //   jirl      $r0, $rj, 0 +  unsigned ScratchReg = +      IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; +  const MCExpr *Sym = +      IsTailCall ? 
Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); +  const LoongArchMCExpr *LE = LoongArchMCExpr::create( +      Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + +  Out.emitInstruction( +      MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), +      getSTI()); +  Out.emitInstruction( +      MCInstBuilder(LoongArch::JIRL) +          .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg) +          .addReg(ScratchReg) +          .addImm(0), +      getSTI()); +} +  bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,                                              OperandVector &Operands,                                              MCStreamer &Out) { @@ -1158,6 +1207,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,    case LoongArch::PseudoLI_D:      emitLoadImm(Inst, IDLoc, Out);      return false; +  case LoongArch::PseudoCALL36: +    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false); +    return false; +  case LoongArch::PseudoTAIL36: +    emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true); +    return false;    }    Out.emitInstruction(Inst, getSTI());    return false; @@ -1439,6 +1494,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,          /*Upper=*/(1 << 19) - 1,          "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer "          "in the range"); +  case Match_InvalidSImm20pcaddu18i: +    return generateImmOutOfRangeError( +        Operands, ErrorInfo, /*Lower=*/-(1 << 19), +        /*Upper=*/(1 << 19) - 1, +        "operand must be a symbol with modifier (e.g. %call36) or an integer " +        "in the range");    case Match_InvalidSImm21lsl2:      return generateImmOutOfRangeError(          Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 72c1f1cec198..ad39658f698e 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -62,43 +62,24 @@ private:                                 MachineBasicBlock::iterator &NextMBBI,                                 unsigned FlagsHi, unsigned SecondOpcode,                                 unsigned FlagsLo); -  bool expandLargeAddressLoad(MachineBasicBlock &MBB, -                              MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              unsigned LastOpcode, unsigned IdentifyingMO); -  bool expandLargeAddressLoad(MachineBasicBlock &MBB, -                              MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              unsigned LastOpcode, unsigned IdentifyingMO, -                              const MachineOperand &Symbol, Register DestReg, -                              bool EraseFromParent);    bool expandLoadAddressPcrel(MachineBasicBlock &MBB,                                MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              bool Large = false); +                              MachineBasicBlock::iterator &NextMBBI);    bool expandLoadAddressGot(MachineBasicBlock &MBB,                              MachineBasicBlock::iterator MBBI, -   
                         MachineBasicBlock::iterator &NextMBBI, -                            bool Large = false); +                            MachineBasicBlock::iterator &NextMBBI);    bool expandLoadAddressTLSLE(MachineBasicBlock &MBB,                                MachineBasicBlock::iterator MBBI,                                MachineBasicBlock::iterator &NextMBBI);    bool expandLoadAddressTLSIE(MachineBasicBlock &MBB,                                MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              bool Large = false); +                              MachineBasicBlock::iterator &NextMBBI);    bool expandLoadAddressTLSLD(MachineBasicBlock &MBB,                                MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              bool Large = false); +                              MachineBasicBlock::iterator &NextMBBI);    bool expandLoadAddressTLSGD(MachineBasicBlock &MBB,                                MachineBasicBlock::iterator MBBI, -                              MachineBasicBlock::iterator &NextMBBI, -                              bool Large = false); -  bool expandFunctionCALL(MachineBasicBlock &MBB, -                          MachineBasicBlock::iterator MBBI, -                          MachineBasicBlock::iterator &NextMBBI, -                          bool IsTailCall); +                              MachineBasicBlock::iterator &NextMBBI);  };  char LoongArchPreRAExpandPseudo::ID = 0; @@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI(    switch (MBBI->getOpcode()) {    case LoongArch::PseudoLA_PCREL:      return expandLoadAddressPcrel(MBB, MBBI, NextMBBI); -  case LoongArch::PseudoLA_PCREL_LARGE: -    return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true);    case LoongArch::PseudoLA_GOT:      return expandLoadAddressGot(MBB, MBBI, NextMBBI); -  case LoongArch::PseudoLA_GOT_LARGE: -    return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true);    case LoongArch::PseudoLA_TLS_LE:      return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI);    case LoongArch::PseudoLA_TLS_IE:      return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI); -  case LoongArch::PseudoLA_TLS_IE_LARGE: -    return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true);    case LoongArch::PseudoLA_TLS_LD:      return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI); -  case LoongArch::PseudoLA_TLS_LD_LARGE: -    return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true);    case LoongArch::PseudoLA_TLS_GD:      return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI); -  case LoongArch::PseudoLA_TLS_GD_LARGE: -    return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true); -  case LoongArch::PseudoCALL: -    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); -  case LoongArch::PseudoTAIL: -    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);    }    return false;  } @@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair(    return true;  } -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( -    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, -    unsigned IdentifyingMO) { -  MachineInstr &MI = *MBBI; -  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, -                                MI.getOperand(2), MI.getOperand(0).getReg(), -              
                  true); -} - -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( -    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, -    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, -    bool EraseFromParent) { -  // Code Sequence: -  // -  // Part1: pcalau12i  $scratch, %MO1(sym) -  // Part0: addi.d     $dest, $zero, %MO0(sym) -  // Part2: lu32i.d    $dest, %MO2(sym) -  // Part3: lu52i.d    $dest, $dest, %MO3(sym) -  // Fin:   LastOpcode $dest, $dest, $scratch - -  unsigned MO0, MO1, MO2, MO3; -  switch (IdentifyingMO) { -  default: -    llvm_unreachable("unsupported identifying MO"); -  case LoongArchII::MO_PCREL_LO: -    MO0 = IdentifyingMO; -    MO1 = LoongArchII::MO_PCREL_HI; -    MO2 = LoongArchII::MO_PCREL64_LO; -    MO3 = LoongArchII::MO_PCREL64_HI; -    break; -  case LoongArchII::MO_GOT_PC_HI: -  case LoongArchII::MO_LD_PC_HI: -  case LoongArchII::MO_GD_PC_HI: -    // These cases relocate just like the GOT case, except for Part1. -    MO0 = LoongArchII::MO_GOT_PC_LO; -    MO1 = IdentifyingMO; -    MO2 = LoongArchII::MO_GOT_PC64_LO; -    MO3 = LoongArchII::MO_GOT_PC64_HI; -    break; -  case LoongArchII::MO_IE_PC_LO: -    MO0 = IdentifyingMO; -    MO1 = LoongArchII::MO_IE_PC_HI; -    MO2 = LoongArchII::MO_IE_PC64_LO; -    MO3 = LoongArchII::MO_IE_PC64_HI; -    break; -  } - -  MachineFunction *MF = MBB.getParent(); -  MachineInstr &MI = *MBBI; -  DebugLoc DL = MI.getDebugLoc(); - -  assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() && -         "Large code model requires LA64"); - -  Register TmpPart1 = -      MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); -  Register TmpPart0 = -      DestReg.isVirtual() -          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) -          : DestReg; -  Register TmpParts02 = -      DestReg.isVirtual() -          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) -          : DestReg; -  Register TmpParts023 = -      DestReg.isVirtual() -          ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) -          : DestReg; - -  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1); -  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0) -                   .addReg(LoongArch::R0); -  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02) -                   // "rj" is needed due to InstrInfo pattern requirement. 
-                   .addReg(TmpPart0, RegState::Kill); -  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023) -                   .addReg(TmpParts02, RegState::Kill); -  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) -      .addReg(TmpParts023) -      .addReg(TmpPart1, RegState::Kill); - -  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { -    const char *SymName = Symbol.getSymbolName(); -    Part0.addExternalSymbol(SymName, MO0); -    Part1.addExternalSymbol(SymName, MO1); -    Part2.addExternalSymbol(SymName, MO2); -    Part3.addExternalSymbol(SymName, MO3); -  } else { -    Part0.addDisp(Symbol, 0, MO0); -    Part1.addDisp(Symbol, 0, MO1); -    Part2.addDisp(Symbol, 0, MO2); -    Part3.addDisp(Symbol, 0, MO3); -  } - -  if (EraseFromParent) -    MI.eraseFromParent(); - -  return true; -} -  bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool Large) { -  if (Large) -    // Emit the 5-insn large address load sequence with the `%pc` family of -    // relocs. -    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, -                                  LoongArchII::MO_PCREL_LO); - +    MachineBasicBlock::iterator &NextMBBI) {    // Code Sequence:    // pcalau12i $rd, %pc_hi20(sym)    // addi.w/d $rd, $rd, %pc_lo12(sym) @@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel(  bool LoongArchPreRAExpandPseudo::expandLoadAddressGot(      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool Large) { -  if (Large) -    // Emit the 5-insn large address load sequence with the `%got_pc` family -    // of relocs, loading the result from GOT with `ldx.d` in the end. -    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, -                                  LoongArchII::MO_GOT_PC_HI); - +    MachineBasicBlock::iterator &NextMBBI) {    // Code Sequence:    // pcalau12i $rd, %got_pc_hi20(sym)    // ld.w/d $rd, $rd, %got_pc_lo12(sym) @@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE(  bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool Large) { -  if (Large) -    // Emit the 5-insn large address load sequence with the `%ie_pc` family -    // of relocs, loading the result with `ldx.d` in the end. -    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, -                                  LoongArchII::MO_IE_PC_LO); - +    MachineBasicBlock::iterator &NextMBBI) {    // Code Sequence:    // pcalau12i $rd, %ie_pc_hi20(sym)    // ld.w/d $rd, $rd, %ie_pc_lo12(sym) @@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE(  bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool Large) { -  if (Large) -    // Emit the 5-insn large address load sequence with the `%got_pc` family -    // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. 
-    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, -                                  LoongArchII::MO_LD_PC_HI); - +    MachineBasicBlock::iterator &NextMBBI) {    // Code Sequence:    // pcalau12i $rd, %ld_pc_hi20(sym)    // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD(  bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool Large) { -  if (Large) -    // Emit the 5-insn large address load sequence with the `%got_pc` family -    // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. -    return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, -                                  LoongArchII::MO_GD_PC_HI); - +    MachineBasicBlock::iterator &NextMBBI) {    // Code Sequence:    // pcalau12i $rd, %gd_pc_hi20(sym)    // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -433,88 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD(                                   SecondOpcode, LoongArchII::MO_GOT_PC_LO);  } -bool LoongArchPreRAExpandPseudo::expandFunctionCALL( -    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, -    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { -  MachineFunction *MF = MBB.getParent(); -  MachineInstr &MI = *MBBI; -  DebugLoc DL = MI.getDebugLoc(); -  const MachineOperand &Func = MI.getOperand(0); -  MachineInstrBuilder CALL; -  unsigned Opcode; - -  switch (MF->getTarget().getCodeModel()) { -  default: -    report_fatal_error("Unsupported code model"); -    break; -  case CodeModel::Small: { -    // CALL: -    // bl func -    // TAIL: -    // b func -    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; -    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); -    break; -  } -  case CodeModel::Medium: { -    // CALL: -    // pcalau12i  $ra, %pc_hi20(func) -    // jirl       $ra, $ra, %pc_lo12(func) -    // TAIL: -    // pcalau12i  $scratch, %pc_hi20(func) -    // jirl       $r0, $scratch, %pc_lo12(func) -    Opcode = -        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; -    Register ScratchReg = -        IsTailCall -            ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) -            : LoongArch::R1; -    MachineInstrBuilder MIB = -        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg); -    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg); -    if (Func.isSymbol()) { -      const char *FnName = Func.getSymbolName(); -      MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI); -      CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO); -      break; -    } -    assert(Func.isGlobal() && "Expected a GlobalValue at this time"); -    const GlobalValue *GV = Func.getGlobal(); -    MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI); -    CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO); -    break; -  } -  case CodeModel::Large: { -    // Emit the 5-insn large address load sequence, either directly or -    // indirectly in case of going through the GOT, then JIRL_TAIL or -    // JIRL_CALL to $addr. -    Opcode = -        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; -    Register AddrReg = -        IsTailCall -            ? 
MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) -            : LoongArch::R1; - -    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); -    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; -    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; -    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, -                           false); -    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); -    break; -  } -  } - -  // Transfer implicit operands. -  CALL.copyImplicitOps(MI); - -  // Transfer MI flags. -  CALL.setMIFlags(MI.getFlags()); - -  MI.eraseFromParent(); -  return true; -} -  class LoongArchExpandPseudo : public MachineFunctionPass {  public:    const LoongArchInstrInfo *TII; @@ -536,6 +288,35 @@ private:                  MachineBasicBlock::iterator &NextMBBI);    bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,                       MachineBasicBlock::iterator &NextMBBI); +  bool expandLargeAddressLoad(MachineBasicBlock &MBB, +                              MachineBasicBlock::iterator MBBI, +                              MachineBasicBlock::iterator &NextMBBI, +                              unsigned LastOpcode, unsigned IdentifyingMO); +  bool expandLargeAddressLoad(MachineBasicBlock &MBB, +                              MachineBasicBlock::iterator MBBI, +                              MachineBasicBlock::iterator &NextMBBI, +                              unsigned LastOpcode, unsigned IdentifyingMO, +                              const MachineOperand &Symbol, Register DestReg, +                              bool EraseFromParent); +  bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB, +                                   MachineBasicBlock::iterator MBBI, +                                   MachineBasicBlock::iterator &NextMBBI); +  bool expandLoadAddressGotLarge(MachineBasicBlock &MBB, +                                 MachineBasicBlock::iterator MBBI, +                                 MachineBasicBlock::iterator &NextMBBI); +  bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB, +                                   MachineBasicBlock::iterator MBBI, +                                   MachineBasicBlock::iterator &NextMBBI); +  bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB, +                                   MachineBasicBlock::iterator MBBI, +                                   MachineBasicBlock::iterator &NextMBBI); +  bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB, +                                   MachineBasicBlock::iterator MBBI, +                                   MachineBasicBlock::iterator &NextMBBI); +  bool expandFunctionCALL(MachineBasicBlock &MBB, +                          MachineBasicBlock::iterator MBBI, +                          MachineBasicBlock::iterator &NextMBBI, +                          bool IsTailCall);  };  char LoongArchExpandPseudo::ID = 0; @@ -570,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB,    switch (MBBI->getOpcode()) {    case LoongArch::PseudoCopyCFR:      return expandCopyCFR(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoLA_PCREL_LARGE: +    return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoLA_GOT_LARGE: +    return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoLA_TLS_IE_LARGE: +    return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoLA_TLS_LD_LARGE: +    return 
expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoLA_TLS_GD_LARGE: +    return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI); +  case LoongArch::PseudoCALL: +  case LoongArch::PseudoCALL_MEDIUM: +  case LoongArch::PseudoCALL_LARGE: +    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); +  case LoongArch::PseudoTAIL: +  case LoongArch::PseudoTAIL_MEDIUM: +  case LoongArch::PseudoTAIL_LARGE: +    return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true);    }    return false; @@ -628,6 +427,212 @@ bool LoongArchExpandPseudo::expandCopyCFR(    return true;  } +bool LoongArchExpandPseudo::expandLargeAddressLoad( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +    unsigned IdentifyingMO) { +  MachineInstr &MI = *MBBI; +  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, +                                MI.getOperand(2), MI.getOperand(0).getReg(), +                                true); +} + +bool LoongArchExpandPseudo::expandLargeAddressLoad( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, +    unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, +    bool EraseFromParent) { +  // Code Sequence: +  // +  // Part1: pcalau12i  $dst, %MO1(sym) +  // Part0: addi.d     $t8, $zero, %MO0(sym) +  // Part2: lu32i.d    $t8, %MO2(sym) +  // Part3: lu52i.d    $t8, $t8, %MO3(sym) +  // Fin:   LastOpcode $dst, $t8, $dst + +  unsigned MO0, MO1, MO2, MO3; +  switch (IdentifyingMO) { +  default: +    llvm_unreachable("unsupported identifying MO"); +  case LoongArchII::MO_PCREL_LO: +    MO0 = IdentifyingMO; +    MO1 = LoongArchII::MO_PCREL_HI; +    MO2 = LoongArchII::MO_PCREL64_LO; +    MO3 = LoongArchII::MO_PCREL64_HI; +    break; +  case LoongArchII::MO_GOT_PC_HI: +  case LoongArchII::MO_LD_PC_HI: +  case LoongArchII::MO_GD_PC_HI: +    // These cases relocate just like the GOT case, except for Part1. +    MO0 = LoongArchII::MO_GOT_PC_LO; +    MO1 = IdentifyingMO; +    MO2 = LoongArchII::MO_GOT_PC64_LO; +    MO3 = LoongArchII::MO_GOT_PC64_HI; +    break; +  case LoongArchII::MO_IE_PC_LO: +    MO0 = IdentifyingMO; +    MO1 = LoongArchII::MO_IE_PC_HI; +    MO2 = LoongArchII::MO_IE_PC64_LO; +    MO3 = LoongArchII::MO_IE_PC64_HI; +    break; +  } + +  MachineInstr &MI = *MBBI; +  DebugLoc DL = MI.getDebugLoc(); +  Register ScratchReg = LoongArch::R20; // $t8 + +  assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() && +         "Large code model requires LA64"); + +  auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); +  auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg) +                   .addReg(LoongArch::R0); +  auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg) +                   // "rj" is needed due to InstrInfo pattern requirement. 
+                   .addReg(ScratchReg); +  auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg) +                   .addReg(ScratchReg); +  BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) +      .addReg(ScratchReg) +      .addReg(DestReg); + +  if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { +    const char *SymName = Symbol.getSymbolName(); +    Part0.addExternalSymbol(SymName, MO0); +    Part1.addExternalSymbol(SymName, MO1); +    Part2.addExternalSymbol(SymName, MO2); +    Part3.addExternalSymbol(SymName, MO3); +  } else { +    Part0.addDisp(Symbol, 0, MO0); +    Part1.addDisp(Symbol, 0, MO1); +    Part2.addDisp(Symbol, 0, MO2); +    Part3.addDisp(Symbol, 0, MO3); +  } + +  if (EraseFromParent) +    MI.eraseFromParent(); + +  return true; +} + +bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI) { +  // Emit the 5-insn large address load sequence with the `%pc` family of +  // relocs. +  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +                                LoongArchII::MO_PCREL_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressGotLarge( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI) { +  // Emit the 5-insn large address load sequence with the `%got_pc` family +  // of relocs, loading the result from GOT with `ldx.d` in the end. +  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +                                LoongArchII::MO_GOT_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI) { +  // Emit the 5-insn large address load sequence with the `%ie_pc` family +  // of relocs, loading the result with `ldx.d` in the end. +  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, +                                LoongArchII::MO_IE_PC_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI) { +  // Emit the 5-insn large address load sequence with the `%got_pc` family +  // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. +  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +                                LoongArchII::MO_LD_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI) { +  // Emit the 5-insn large address load sequence with the `%got_pc` family +  // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. 
+  return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, +                                LoongArchII::MO_GD_PC_HI); +} + +bool LoongArchExpandPseudo::expandFunctionCALL( +    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, +    MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { +  MachineFunction *MF = MBB.getParent(); +  MachineInstr &MI = *MBBI; +  DebugLoc DL = MI.getDebugLoc(); +  const MachineOperand &Func = MI.getOperand(0); +  MachineInstrBuilder CALL; +  unsigned Opcode; + +  switch (MF->getTarget().getCodeModel()) { +  default: +    report_fatal_error("Unsupported code model"); +    break; +  case CodeModel::Small: { +    // CALL: +    // bl func +    // TAIL: +    // b func +    Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; +    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); +    break; +  } +  case CodeModel::Medium: { +    // CALL: +    // pcaddu18i  $ra, %call36(func) +    // jirl       $ra, $ra, 0 +    // TAIL: +    // pcaddu18i  $t8, %call36(func) +    // jr         $t8 +    Opcode = +        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +    Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; +    MachineInstrBuilder MIB = +        BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); + +    CALL = +        BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); + +    if (Func.isSymbol()) +      MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); +    else +      MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); +    break; +  } +  case CodeModel::Large: { +    // Emit the 5-insn large address load sequence, either directly or +    // indirectly in case of going through the GOT, then JIRL_TAIL or +    // JIRL_CALL to $addr. +    Opcode = +        IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; +    Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1; + +    bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); +    unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; +    unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; +    expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, +                           false); +    CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); +    break; +  } +  } + +  // Transfer implicit operands. +  CALL.copyImplicitOps(MI); + +  // Transfer MI flags. 
+  CALL.setMIFlags(MI.getFlags()); + +  MI.eraseFromParent(); +  return true; +} +  } // end namespace  INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e14bbadf9ed2..70f782b81270 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -525,8 +525,7 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,    if (isa<ConstantSDNode>(Idx) &&        (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || -       EltTy == MVT::f64 || -       cast<ConstantSDNode>(Idx)->getZExtValue() < NumElts / 2)) +       EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2))      return Op;    return SDValue(); @@ -762,12 +761,13 @@ static SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty,  template <class NodeTy>  SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, +                                         CodeModel::Model M,                                           bool IsLocal) const {    SDLoc DL(N);    EVT Ty = getPointerTy(DAG.getDataLayout());    SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); -  switch (DAG.getTarget().getCodeModel()) { +  switch (M) {    default:      report_fatal_error("Unsupported code model"); @@ -808,24 +808,35 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,  SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op,                                                     SelectionDAG &DAG) const { -  return getAddr(cast<BlockAddressSDNode>(Op), DAG); +  return getAddr(cast<BlockAddressSDNode>(Op), DAG, +                 DAG.getTarget().getCodeModel());  }  SDValue LoongArchTargetLowering::lowerJumpTable(SDValue Op,                                                  SelectionDAG &DAG) const { -  return getAddr(cast<JumpTableSDNode>(Op), DAG); +  return getAddr(cast<JumpTableSDNode>(Op), DAG, +                 DAG.getTarget().getCodeModel());  }  SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op,                                                     SelectionDAG &DAG) const { -  return getAddr(cast<ConstantPoolSDNode>(Op), DAG); +  return getAddr(cast<ConstantPoolSDNode>(Op), DAG, +                 DAG.getTarget().getCodeModel());  }  SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,                                                      SelectionDAG &DAG) const {    GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);    assert(N->getOffset() == 0 && "unexpected offset in global node"); -  return getAddr(N, DAG, N->getGlobal()->isDSOLocal()); +  auto CM = DAG.getTarget().getCodeModel(); +  const GlobalValue *GV = N->getGlobal(); + +  if (GV->isDSOLocal() && isa<GlobalVariable>(GV)) { +    if (auto GCM = dyn_cast<GlobalVariable>(GV)->getCodeModel()) +      CM = *GCM; +  } + +  return getAddr(N, DAG, CM, GV->isDSOLocal());  }  SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, @@ -1383,28 +1394,28 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,      if (IntrinsicEnum == Intrinsic::loongarch_cacop_w && Subtarget.is64Bit())        return emitIntrinsicErrorMessage(Op, ErrorMsgReqLA32, DAG);      // call void @llvm.loongarch.cacop.[d/w](uimm5, rj, simm12) -    unsigned Imm1 = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm1 = 
Op2->getAsZExtVal();      int Imm2 = cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue();      if (!isUInt<5>(Imm1) || !isInt<12>(Imm2))        return emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG);      return Op;    }    case Intrinsic::loongarch_dbar: { -    unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm = Op2->getAsZExtVal();      return !isUInt<15>(Imm)                 ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)                 : DAG.getNode(LoongArchISD::DBAR, DL, MVT::Other, Chain,                               DAG.getConstant(Imm, DL, GRLenVT));    }    case Intrinsic::loongarch_ibar: { -    unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm = Op2->getAsZExtVal();      return !isUInt<15>(Imm)                 ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)                 : DAG.getNode(LoongArchISD::IBAR, DL, MVT::Other, Chain,                               DAG.getConstant(Imm, DL, GRLenVT));    }    case Intrinsic::loongarch_break: { -    unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm = Op2->getAsZExtVal();      return !isUInt<15>(Imm)                 ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)                 : DAG.getNode(LoongArchISD::BREAK, DL, MVT::Other, Chain, @@ -1413,7 +1424,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,    case Intrinsic::loongarch_movgr2fcsr: {      if (!Subtarget.hasBasicF())        return emitIntrinsicErrorMessage(Op, ErrorMsgReqF, DAG); -    unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm = Op2->getAsZExtVal();      return !isUInt<2>(Imm)                 ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)                 : DAG.getNode(LoongArchISD::MOVGR2FCSR, DL, MVT::Other, Chain, @@ -1422,7 +1433,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op,                                           Op.getOperand(3)));    }    case Intrinsic::loongarch_syscall: { -    unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    unsigned Imm = Op2->getAsZExtVal();      return !isUInt<15>(Imm)                 ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG)                 : DAG.getNode(LoongArchISD::SYSCALL, DL, MVT::Other, Chain, @@ -1925,7 +1936,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(          emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqF);          return;        } -      unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +      unsigned Imm = Op2->getAsZExtVal();        if (!isUInt<2>(Imm)) {          emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);          return; @@ -1981,7 +1992,7 @@ void LoongArchTargetLowering::ReplaceNodeResults(        CSR_CASE(iocsrrd_d);  #undef CSR_CASE      case Intrinsic::loongarch_csrrd_w: { -      unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +      unsigned Imm = Op2->getAsZExtVal();        if (!isUInt<14>(Imm)) {          emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR);          return; @@ -3381,8 +3392,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {      // TODO: Add more target-dependent nodes later.      
NODE_NAME_CASE(CALL) +    NODE_NAME_CASE(CALL_MEDIUM) +    NODE_NAME_CASE(CALL_LARGE)      NODE_NAME_CASE(RET)      NODE_NAME_CASE(TAIL) +    NODE_NAME_CASE(TAIL_MEDIUM) +    NODE_NAME_CASE(TAIL_LARGE)      NODE_NAME_CASE(SLL_W)      NODE_NAME_CASE(SRA_W)      NODE_NAME_CASE(SRL_W) @@ -4240,15 +4255,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,    // Emit the call.    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); +  unsigned Op; +  switch (DAG.getTarget().getCodeModel()) { +  default: +    report_fatal_error("Unsupported code model"); +  case CodeModel::Small: +    Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL; +    break; +  case CodeModel::Medium: +    assert(Subtarget.is64Bit() && "Medium code model requires LA64"); +    Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM; +    break; +  case CodeModel::Large: +    assert(Subtarget.is64Bit() && "Large code model requires LA64"); +    Op = IsTailCall ? LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE; +    break; +  }    if (IsTailCall) {      MF.getFrameInfo().setHasTailCall(); -    SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops); +    SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops);      DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);      return Ret;    } -  Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); +  Chain = DAG.getNode(Op, DL, NodeTys, Ops);    DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);    Glue = Chain.getValue(1); diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6f8878f9ccd5..72182623b2c3 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -27,8 +27,12 @@ enum NodeType : unsigned {    // TODO: add more LoongArchISDs    CALL, +  CALL_MEDIUM, +  CALL_LARGE,    RET,    TAIL, +  TAIL_MEDIUM, +  TAIL_LARGE,    // 32-bit shifts, directly matching the semantics of the named LoongArch    // instructions. 
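The LowerCall hunk above makes the call/tail SelectionDAG node a function of the effective code model, with the Medium and Large cases asserting LA64. The standalone sketch below models just that decision table; the enumerators and the selectCallOp helper are illustrative stand-ins, not the real LoongArchISD names or SelectionDAG API.

```cpp
#include <cstdio>
#include <stdexcept>

// Standalone model of the opcode selection added to LowerCall above.
// The enumerators are stand-ins for the LoongArchISD call/tail nodes.
enum class CM { Small, Medium, Large };
enum class CallOp { Call, CallMedium, CallLarge, Tail, TailMedium, TailLarge };

CallOp selectCallOp(CM CodeModel, bool IsTailCall, bool IsLA64) {
  switch (CodeModel) {
  case CM::Small:
    return IsTailCall ? CallOp::Tail : CallOp::Call;
  case CM::Medium:
    if (!IsLA64)
      throw std::runtime_error("Medium code model requires LA64");
    return IsTailCall ? CallOp::TailMedium : CallOp::CallMedium;
  case CM::Large:
    if (!IsLA64)
      throw std::runtime_error("Large code model requires LA64");
    return IsTailCall ? CallOp::TailLarge : CallOp::CallLarge;
  }
  throw std::runtime_error("Unsupported code model");
}

int main() {
  // A medium-code-model tail call on LA64 maps to the TAIL_MEDIUM-style node.
  std::printf("%d\n", (int)selectCallOp(CM::Medium, /*IsTailCall=*/true,
                                        /*IsLA64=*/true));
}
```

In the backend itself the Medium/Large cases are asserts rather than exceptions, and the chosen node is then matched by the new PseudoCALL_MEDIUM / PseudoCALL_LARGE (and TAIL) patterns added to LoongArchInstrInfo.td below.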
@@ -250,7 +254,8 @@ private:                           LoongArchCCAssignFn Fn) const;    template <class NodeTy> -  SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; +  SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M, +                  bool IsLocal = true) const;    SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,                             unsigned Opc, bool Large = false) const;    SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2fea0f33e9eb..78074c012876 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone,  def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall,                              [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,                               SDNPVariadic]>; +def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall, +                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, +                                    SDNPVariadic]>; +def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall, +                                   [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, +                                    SDNPVariadic]>; +def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall, +                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, +                                   SDNPVariadic]>; +def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, +                                  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, +                                   SDNPVariadic]>;  def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;  def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;  def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; @@ -377,6 +389,10 @@ def simm20_lu32id : SImm20Operand {    let ParserMatchClass = SImmAsmOperand<20, "lu32id">;  } +def simm20_pcaddu18i : SImm20Operand { +  let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">; +} +  def simm21_lsl2 : Operand<OtherVT> {    let ParserMatchClass = SImmAsmOperand<21, "lsl2">;    let EncoderMethod = "getImmOpValueAsr<2>"; @@ -832,7 +848,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst),                         "$rd, $imm20">;  }  def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>; -def PCADDU18I : ALU_1RI20<0x1e000000, simm20>; +def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>;  def MUL_D     : ALU_3R<0x001d8000>;  def MULH_D    : ALU_3R<0x001e0000>;  def MULH_DU   : ALU_3R<0x001e8000>; @@ -1395,16 +1411,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>;  def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)),            (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; +// Function call with 'Small' code model.  let isCall = 1, Defs = [R1] in -def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>; +def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>;  def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>;  def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +// Function call with 'Medium' code model. 
+let isCall = 1, Defs = [R1, R20], Size = 8 in +def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium tglobaladdr:$func), +          (PseudoCALL_MEDIUM tglobaladdr:$func)>; +def : Pat<(loongarch_call_medium texternalsym:$func), +          (PseudoCALL_MEDIUM texternalsym:$func)>; +} // Predicates = [IsLA64] + +// Function call with 'Large' code model. +let isCall = 1, Defs = [R1, R20], Size = 24 in +def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_large tglobaladdr:$func), +          (PseudoCALL_LARGE tglobaladdr:$func)>; +def : Pat<(loongarch_call_large texternalsym:$func), +          (PseudoCALL_LARGE texternalsym:$func)>; +} // Predicates = [IsLA64] +  let isCall = 1, Defs = [R1] in  def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj),                                  [(loongarch_call GPR:$rj)]>,                           PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +}  let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in  def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, @@ -1415,18 +1458,47 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in  def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>,                  PseudoInstExpansion<(JIRL R0, R1, 0)>; +// Tail call with 'Small' code model.  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in -def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>; +def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>;  def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)),            (PseudoTAIL tglobaladdr:$dst)>;  def : Pat<(loongarch_tail (iPTR texternalsym:$dst)),            (PseudoTAIL texternalsym:$dst)>; +// Tail call with 'Medium' code model. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, +    Uses = [R3], Defs = [R20], Size = 8 in +def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)), +          (PseudoTAIL_MEDIUM tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)), +          (PseudoTAIL_MEDIUM texternalsym:$dst)>; +} // Predicates = [IsLA64] + +// Tail call with 'Large' code model. 
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, +    Uses = [R3], Defs = [R19, R20], Size = 24 in +def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)), +          (PseudoTAIL_LARGE tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)), +          (PseudoTAIL_LARGE texternalsym:$dst)>; +} // Predicates = [IsLA64] +  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in  def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj),                                  [(loongarch_tail GPRT:$rj)]>,                           PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +}  let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,      hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in @@ -1439,6 +1511,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>,                        PseudoInstExpansion<(JIRL R0, GPR:$rj,                                             simm16_lsl2:$imm16)>; +/// call36/taill36 macro instructions +let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1, +    Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in +def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [], +                          "call36", "$dst">, +                   Requires<[IsLA64]>; +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3], +    isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0, +    mayStore = 0, mayLoad = 0 in +def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [], +                          "tail36", "$tmp, $dst">, +                   Requires<[IsLA64]>; +  /// Load address (la*) macro instructions.  // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser. 
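Stepping back to the lowerGlobalAddress hunk earlier in LoongArchISelLowering.cpp: a dso-local global variable may now carry its own code model, which overrides the module-level model passed to getAddr. The sketch below is a standalone model of that selection only; Global and effectiveCodeModel are made-up stand-ins for GlobalValue/GlobalVariable and the in-tree logic.

```cpp
#include <cstdio>
#include <optional>

enum class CM { Small, Medium, Large };

// Stand-in for the relevant bits of a global: whether it is dso-local and
// whether it is a variable carrying an explicit per-global code model.
struct Global {
  bool IsDSOLocal;
  std::optional<CM> PerGlobalCM; // empty for functions / unannotated globals
};

// Mirrors the shape of the lowerGlobalAddress change: start from the target's
// code model and let a dso-local global variable's own attribute win.
CM effectiveCodeModel(CM TargetCM, const Global &GV) {
  if (GV.IsDSOLocal && GV.PerGlobalCM)
    return *GV.PerGlobalCM;
  return TargetCM;
}

int main() {
  Global G{/*IsDSOLocal=*/true, CM::Large};
  // Module built for the medium model, but this one global asks for large.
  std::printf("%d\n", (int)effectiveCodeModel(CM::Medium, G)); // prints 2
}
```

Globals without the attribute, and globals that are not dso-local, keep whatever code model the module/target selected.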
@@ -1451,6 +1536,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst),                                  "la.abs", "$dst, $src">;  def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],                              "la.pcrel", "$dst, $src">; +let Defs = [R20], Size = 20 in  def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst),                                    (ins GPR:$tmp, bare_symbol:$src), [],                                    "la.pcrel", "$dst, $tmp, $src">, @@ -1462,28 +1548,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0,      isAsmParserOnly = 1 in {  def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],                            "la.got", "$dst, $src">; +def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +                             "la.tls.ie", "$dst, $src">; +def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +                             "la.tls.ld", "$dst, $src">; +def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], +                             "la.tls.gd", "$dst, $src">; +let Defs = [R20], Size = 20 in {  def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst),                                  (ins GPR:$tmp, bare_symbol:$src), [],                                  "la.got", "$dst, $tmp, $src">,                           Requires<[IsLA64]>; -def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], -                             "la.tls.ie", "$dst, $src">;  def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst),                                     (ins GPR:$tmp, bare_symbol:$src), [],                                     "la.tls.ie", "$dst, $tmp, $src">,                              Requires<[IsLA64]>; -def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], -                             "la.tls.ld", "$dst, $src">;  def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst),                                     (ins GPR:$tmp, bare_symbol:$src), [],                                     "la.tls.ld", "$dst, $tmp, $src">,                              Requires<[IsLA64]>; -def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], -                             "la.tls.gd", "$dst, $src">;  def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst),                                     (ins GPR:$tmp, bare_symbol:$src), [],                                     "la.tls.gd", "$dst, $tmp, $src">,                              Requires<[IsLA64]>; +} // Defs = [R20], Size = 20  }  // Load address inst alias: "la", "la.global" and "la.local". diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index 5daa9481c907..98ad49f25e3f 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,    case LoongArchII::MO_GD_PC_HI:      Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20;      break; +  case LoongArchII::MO_CALL36: +    Kind = LoongArchMCExpr::VK_LoongArch_CALL36; +    break;      // TODO: Handle more target-flags.    
} diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index 257b947a3ce4..092b5f1fb442 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -15,6 +15,7 @@  #include "LoongArch.h"  #include "LoongArchInstrInfo.h"  #include "LoongArchSubtarget.h" +#include "MCTargetDesc/LoongArchBaseInfo.h"  #include "MCTargetDesc/LoongArchMCTargetDesc.h"  #include "llvm/CodeGen/MachineFrameInfo.h"  #include "llvm/CodeGen/MachineFunction.h" @@ -194,3 +195,25 @@ bool LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed());    return false;  } + +bool LoongArchRegisterInfo::canRealignStack(const MachineFunction &MF) const { +  if (!TargetRegisterInfo::canRealignStack(MF)) +    return false; + +  const MachineRegisterInfo *MRI = &MF.getRegInfo(); +  const LoongArchFrameLowering *TFI = getFrameLowering(MF); + +  // Stack realignment requires a frame pointer.  If we already started +  // register allocation with frame pointer elimination, it is too late now. +  if (!MRI->canReserveReg(LoongArch::R22)) +    return false; + +  // We may also need a base pointer if there are dynamic allocas or stack +  // pointer adjustments around calls. +  if (TFI->hasReservedCallFrame(MF)) +    return true; + +  // A base pointer is required and allowed.  Check that it isn't too late to +  // reserve it. +  return MRI->canReserveReg(LoongArchABI::getBPReg()); +} diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h index 7e8f26b14097..d1e40254c297 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -51,6 +51,7 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo {    bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {      return true;    } +  bool canRealignStack(const MachineFunction &MF) const override;  };  } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index a5a4d78aceee..62ae1dea00d6 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT,    switch (*CM) {    case CodeModel::Small: -  case CodeModel::Medium:      return *CM; +  case CodeModel::Medium:    case CodeModel::Large:      if (!TT.isArch64Bit()) -      report_fatal_error("Large code model requires LA64"); +      report_fatal_error("Medium/Large code model requires LA64");      return *CM;    default:      report_fatal_error( diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 6d8ef1bf96cb..518f6b10edab 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -91,6 +91,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, 
uint64_t Value,    case FK_Data_2:    case FK_Data_4:    case FK_Data_8: +  case FK_Data_leb128:      return Value;    case LoongArch::fixup_loongarch_b16: {      if (!isInt<18>(Value)) @@ -128,6 +129,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,    }  } +static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, +                        MutableArrayRef<char> Data, uint64_t Value) { +  unsigned I; +  for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) +    Data[I] |= uint8_t(Value & 0x7f); +  if (Value) +    Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); +} +  void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm,                                       const MCFixup &Fixup,                                       const MCValue &Target, @@ -143,6 +153,10 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm,    MCFixupKindInfo Info = getFixupKindInfo(Kind);    MCContext &Ctx = Asm.getContext(); +  // Fixup leb128 separately. +  if (Fixup.getTargetKind() == FK_Data_leb128) +    return fixupLeb128(Ctx, Fixup, Data, Value); +    // Apply any target-specific value adjustments.    Value = adjustFixupValue(Fixup, Value, Ctx); @@ -173,6 +187,7 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm,    case FK_Data_2:    case FK_Data_4:    case FK_Data_8: +  case FK_Data_leb128:      return !Target.isAbsolute();    }  } @@ -202,9 +217,24 @@ getRelocPairForSize(unsigned Size) {      return std::make_pair(          MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64),          MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64)); +  case 128: +    return std::make_pair( +        MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD_ULEB128), +        MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB_ULEB128));    }  } +std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, +                                                       MCAsmLayout &Layout, +                                                       int64_t &Value) const { +  const MCExpr &Expr = LF.getValue(); +  if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, Layout)) +    return std::make_pair(false, false); +  LF.getFixups().push_back( +      MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc())); +  return std::make_pair(true, true); +} +  bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,                                         const MCSubtargetInfo *STI) const {    // We mostly follow binutils' convention here: align to 4-byte boundary with a @@ -226,21 +256,27 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout,                                                    uint64_t &FixedValue) const {    std::pair<MCFixupKind, MCFixupKind> FK;    uint64_t FixedValueA, FixedValueB; -  const MCSection &SecA = Target.getSymA()->getSymbol().getSection(); -  const MCSection &SecB = Target.getSymB()->getSymbol().getSection(); - -  // We need record relocation if SecA != SecB. Usually SecB is same as the -  // section of Fixup, which will be record the relocation as PCRel. If SecB -  // is not same as the section of Fixup, it will report error. Just return -  // false and then this work can be finished by handleFixup. -  if (&SecA != &SecB) -    return false; - -  // In SecA == SecB case. If the linker relaxation is enabled, we need record -  // the ADD, SUB relocations. 
Otherwise the FixedValue has already been -  // calculated out in evaluateFixup, return true and avoid record relocations. -  if (!STI.hasFeature(LoongArch::FeatureRelax)) -    return true; +  const MCSymbol &SA = Target.getSymA()->getSymbol(); +  const MCSymbol &SB = Target.getSymB()->getSymbol(); + +  bool force = !SA.isInSection() || !SB.isInSection(); +  if (!force) { +    const MCSection &SecA = SA.getSection(); +    const MCSection &SecB = SB.getSection(); + +    // We need record relocation if SecA != SecB. Usually SecB is same as the +    // section of Fixup, which will be record the relocation as PCRel. If SecB +    // is not same as the section of Fixup, it will report error. Just return +    // false and then this work can be finished by handleFixup. +    if (&SecA != &SecB) +      return false; + +    // In SecA == SecB case. If the linker relaxation is enabled, we need record +    // the ADD, SUB relocations. Otherwise the FixedValue has already been calc- +    // ulated out in evaluateFixup, return true and avoid record relocations. +    if (!STI.hasFeature(LoongArch::FeatureRelax)) +      return true; +  }    switch (Fixup.getKind()) {    case llvm::FK_Data_1: @@ -255,6 +291,9 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout,    case llvm::FK_Data_8:      FK = getRelocPairForSize(64);      break; +  case llvm::FK_Data_leb128: +    FK = getRelocPairForSize(128); +    break;    default:      llvm_unreachable("unsupported fixup size");    } diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index fef0e84600a7..71977217f59b 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -66,6 +66,9 @@ public:    void relaxInstruction(MCInst &Inst,                          const MCSubtargetInfo &STI) const override {} +  std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, +                                    int64_t &Value) const override; +    bool writeNopData(raw_ostream &OS, uint64_t Count,                      const MCSubtargetInfo *STI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h index cee6dad1f095..0692cb92b694 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h @@ -47,6 +47,7 @@ enum {    MO_IE_PC64_HI,    MO_LD_PC_HI,    MO_GD_PC_HI, +  MO_CALL36    // TODO: Add more flags.  
};  } // end namespace LoongArchII diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index fe19a4f2d3c8..1dec816f3473 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx,      return ELF::R_LARCH_TLS_LE64_LO20;    case LoongArch::fixup_loongarch_tls_le64_hi12:      return ELF::R_LARCH_TLS_LE64_HI12; +  case LoongArch::fixup_loongarch_call36: +    return ELF::R_LARCH_CALL36;      // TODO: Handle more fixup-kinds.    }  } diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h index 178fa6e5262b..e827bae1f3e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h @@ -108,7 +108,10 @@ enum Fixups {    // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w.    fixup_loongarch_tls_gd_hi20,    // Generate an R_LARCH_RELAX which indicates the linker may relax here. -  fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX +  fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, +  // 36-bit fixup corresponding to %call36(foo) for a pair instructions: +  // pcaddu18i+jirl. +  fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36,  };  } // end namespace LoongArch  } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index d2ea062dc09a..9ac0128f2517 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO,      case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20:        FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20;        break; +    case LoongArchMCExpr::VK_LoongArch_CALL36: +      FixupKind = LoongArch::fixup_loongarch_call36; +      break;      }    } else if (Kind == MCExpr::SymbolRef &&               cast<MCSymbolRefExpr>(Expr)->getKind() == diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp index 82c992b1cc8c..8ca8876a19b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp @@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) {      return "gd_pc_hi20";    case VK_LoongArch_TLS_GD_HI20:      return "gd_hi20"; +  case VK_LoongArch_CALL36: +    return "call36";    }  } @@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) {        .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20)        .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20)        .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20) +      .Case("call36", VK_LoongArch_CALL36)        
.Default(VK_LoongArch_Invalid);  } diff --git a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h index 93251f824103..bd828116d7fa 100644 --- a/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +++ b/contrib/llvm-project/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h @@ -61,6 +61,7 @@ public:      VK_LoongArch_TLS_LD_HI20,      VK_LoongArch_TLS_GD_PC_HI20,      VK_LoongArch_TLS_GD_HI20, +    VK_LoongArch_CALL36,      VK_LoongArch_Invalid // Must be the last item.    }; diff --git a/contrib/llvm-project/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h index a10401ed1a9a..cbe30ec494c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h @@ -20,7 +20,6 @@ namespace llvm {  class M68kSubtarget; -/// This struct provides the information for the target register banks.  struct M68kLegalizerInfo : public LegalizerInfo {  public:    M68kLegalizerInfo(const M68kSubtarget &ST); diff --git a/contrib/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp index c4d7a0dec7f3..158393f02a24 100644 --- a/contrib/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -2375,7 +2375,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {    // a >= b ? -1 :  0 -> RES = setcc_carry    // a >= b ?  0 : -1 -> RES = ~setcc_carry    if (Cond.getOpcode() == M68kISD::SUB) { -    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); +    unsigned CondCode = CC->getAsZExtVal();      if ((CondCode == M68k::COND_CC || CondCode == M68k::COND_CS) &&          (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && @@ -2491,7 +2491,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {        Cond = Cmp;        AddTest = false;      } else { -      switch (cast<ConstantSDNode>(CC)->getZExtValue()) { +      switch (CC->getAsZExtVal()) {        default:          break;        case M68k::COND_VS: diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 660861a5d521..efb23b1a4e3f 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -308,12 +308,12 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) {    switch (VT.getSimpleVT().SimpleTy) {    case MVT::i8: -    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1) +    if (LD->getOffset()->getAsZExtVal() != 1)        return false;      break;    case MVT::i16: -    if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2) +    if (LD->getOffset()->getAsZExtVal() != 2)        return false;      break; diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index d3b59138a5a9..e68904863cfc 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -333,6 +333,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM,    setMinFunctionAlignment(Align(2));    
setPrefFunctionAlignment(Align(2)); +  setMaxAtomicSizeInBitsSupported(0);  }  SDValue MSP430TargetLowering::LowerOperation(SDValue Op, @@ -1168,8 +1169,8 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {    bool Invert = false;    bool Shift = false;    bool Convert = true; -  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) { -   default: +  switch (TargetCC->getAsZExtVal()) { +  default:      Convert = false;      break;     case MSP430CC::COND_HS: @@ -1193,7 +1194,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {       // C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however,       // Res = (SR >> 1) & 1 is 1 word shorter.       break; -  } +   }    EVT VT = Op.getValueType();    SDValue One  = DAG.getConstant(1, dl, VT);    if (Convert) { diff --git a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index 39e0658eb70d..283de46e57d5 100644 --- a/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -65,6 +65,7 @@ public:      return getTM<MSP430TargetMachine>();    } +  void addIRPasses() override;    bool addInstSelector() override;    void addPreEmitPass() override;  }; @@ -81,6 +82,12 @@ MachineFunctionInfo *MSP430TargetMachine::createMachineFunctionInfo(                                                                        F, STI);  } +void MSP430PassConfig::addIRPasses() { +  addPass(createAtomicExpandPass()); + +  TargetPassConfig::addIRPasses(); +} +  bool MSP430PassConfig::addInstSelector() {    // Install an instruction selector.    addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp index 7fcf375aa10b..192ed1cec79a 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -492,7 +492,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) {          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();          TmpOffset += SL->getElementOffset(Idx);        } else { -        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +        uint64_t S = GTI.getSequentialElementStride(DL);          while (true) {            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {              // Constant-offset addressing. diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp index 483eba4e4f47..d431d3d91494 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2042,8 +2042,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {      return Op;    SDValue CCNode  = CondRes.getOperand(2); -  Mips::CondCode CC = -    (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue(); +  Mips::CondCode CC = (Mips::CondCode)CCNode->getAsZExtVal();    unsigned Opc = invertFPCondCodeUser(CC) ? 
Mips::BRANCH_F : Mips::BRANCH_T;    SDValue BrCode = DAG.getConstant(Opc, DL, MVT::i32);    SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32); diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index 14f26201e6c0..f5e94235859a 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -330,8 +330,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {    verify(*ST.getInstrInfo());  } -bool MipsLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                       MachineInstr &MI) const { +bool MipsLegalizerInfo::legalizeCustom( +    LegalizerHelper &Helper, MachineInstr &MI, +    LostDebugLocObserver &LocObserver) const {    using namespace TargetOpcode;    MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.h index 05027b718a85..63daebf26470 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsLegalizerInfo.h @@ -25,7 +25,8 @@ class MipsLegalizerInfo : public LegalizerInfo {  public:    MipsLegalizerInfo(const MipsSubtarget &ST); -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;    bool legalizeIntrinsic(LegalizerHelper &Helper,                           MachineInstr &MI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 0ed87ee0809a..c0e978018919 100644 --- a/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -76,7 +76,7 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,  }  unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const { -  uint64_t RegNum = cast<ConstantSDNode>(RegIdx)->getZExtValue(); +  uint64_t RegNum = RegIdx->getAsZExtVal();    return Mips::MSACtrlRegClass.getRegister(RegNum);  } diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 815c46edb6fa..7abe984b34e1 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2076,7 +2076,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {      VTs = CurDAG->getVTList(EVTs);    } -  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); +  unsigned OffsetVal = Offset->getAsZExtVal();    SmallVector<SDValue, 2> Ops;    Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); @@ -2091,7 +2091,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {    SDLoc DL(N);    SDValue Chain = N->getOperand(0);    SDValue Offset = N->getOperand(1); -  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); +  unsigned OffsetVal = Offset->getAsZExtVal();    MemSDNode *Mem = cast<MemSDNode>(N);    // How many elements do we have? 
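Two hunks in this area touch the same knob from opposite ends: MSP430 above now runs AtomicExpand and reports setMaxAtomicSizeInBitsSupported(0), while NVPTX below advertises 64 bits. The threshold tells AtomicExpandPass which atomic operations to rewrite into __atomic_* libcalls; the toy below models only that dispatch and is not the pass itself.

```cpp
#include <cstdio>

// Toy model of what setMaxAtomicSizeInBitsSupported(N) means to AtomicExpand:
// atomics wider than N bits become __atomic_* libcalls, the rest are left for
// normal instruction selection. N = 0 therefore routes every atomic operation
// to a libcall (the MSP430 case); NVPTX keeps native handling up to 64 bits.
const char *lowerAtomic(unsigned WidthInBits, unsigned MaxSupportedBits) {
  return WidthInBits <= MaxSupportedBits ? "native atomic instructions"
                                         : "__atomic_* libcall";
}

int main() {
  std::printf("MSP430, 16-bit atomic:  %s\n", lowerAtomic(16, 0));
  std::printf("NVPTX, 64-bit atomic:   %s\n", lowerAtomic(64, 64));
  std::printf("NVPTX, 128-bit atomic:  %s\n", lowerAtomic(128, 64));
}
```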
@@ -2158,9 +2158,9 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {    SDLoc DL(N);    SDValue Chain = N->getOperand(0);    SDValue Param = N->getOperand(1); -  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue(); +  unsigned ParamVal = Param->getAsZExtVal();    SDValue Offset = N->getOperand(2); -  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); +  unsigned OffsetVal = Offset->getAsZExtVal();    MemSDNode *Mem = cast<MemSDNode>(N);    SDValue Glue = N->getOperand(N->getNumOperands() - 1); diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index e8f36bf50a1b..c65090d915ef 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -854,6 +854,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,    computeRegisterProperties(STI.getRegisterInfo());    setMinCmpXchgSizeInBits(32); +  setMaxAtomicSizeInBitsSupported(64);  }  const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -5811,7 +5812,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,    SDLoc DL(N);    // Get the intrinsic ID -  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); +  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();    switch (IntrinNo) {    default:      return; diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8d895762fbe1..fad69f5e80a7 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -225,7 +225,8 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {    AAM.registerFunctionAnalysis<NVPTXAA>();  } -void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void NVPTXTargetMachine::registerPassBuilderCallbacks( +    PassBuilder &PB, bool PopulateClassToPassNames) {    PB.registerPipelineParsingCallback(        [](StringRef PassName, FunctionPassManager &PM,           ArrayRef<PassBuilder::PipelineElement>) { diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index cfdd8da9b765..9e6bf929badb 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -69,7 +69,8 @@ public:    void registerDefaultAliasAnalyses(AAManager &AAM) override; -  void registerPassBuilderCallbacks(PassBuilder &PB) override; +  void registerPassBuilderCallbacks(PassBuilder &PB, +                                    bool PopulateClassToPassNames) override;    TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index c73721da46e3..7aa63f9fc0c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -180,10 +180,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {        return {Intrinsic::ceil, FTZ_MustBeOn};      case Intrinsic::nvvm_fabs_d:        return {Intrinsic::fabs, FTZ_Any}; -    case 
Intrinsic::nvvm_fabs_f: -      return {Intrinsic::fabs, FTZ_MustBeOff}; -    case Intrinsic::nvvm_fabs_ftz_f: -      return {Intrinsic::fabs, FTZ_MustBeOn};      case Intrinsic::nvvm_floor_d:        return {Intrinsic::floor, FTZ_Any};      case Intrinsic::nvvm_floor_f: @@ -264,12 +260,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {        return {Intrinsic::minimum, FTZ_MustBeOff, true};      case Intrinsic::nvvm_fmin_ftz_nan_f16x2:        return {Intrinsic::minimum, FTZ_MustBeOn, true}; -    case Intrinsic::nvvm_round_d: -      return {Intrinsic::round, FTZ_Any}; -    case Intrinsic::nvvm_round_f: -      return {Intrinsic::round, FTZ_MustBeOff}; -    case Intrinsic::nvvm_round_ftz_f: -      return {Intrinsic::round, FTZ_MustBeOn};      case Intrinsic::nvvm_sqrt_rn_d:        return {Intrinsic::sqrt, FTZ_Any};      case Intrinsic::nvvm_sqrt_f: @@ -278,10 +268,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {        // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are        // the versions with explicit ftz-ness.        return {Intrinsic::sqrt, FTZ_Any}; -    case Intrinsic::nvvm_sqrt_rn_f: -      return {Intrinsic::sqrt, FTZ_MustBeOff}; -    case Intrinsic::nvvm_sqrt_rn_ftz_f: -      return {Intrinsic::sqrt, FTZ_MustBeOn};      case Intrinsic::nvvm_trunc_d:        return {Intrinsic::trunc, FTZ_Any};      case Intrinsic::nvvm_trunc_f: @@ -316,24 +302,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {        return {Instruction::UIToFP};      // NVVM intrinsics that map to LLVM binary ops. -    case Intrinsic::nvvm_add_rn_d: -      return {Instruction::FAdd, FTZ_Any}; -    case Intrinsic::nvvm_add_rn_f: -      return {Instruction::FAdd, FTZ_MustBeOff}; -    case Intrinsic::nvvm_add_rn_ftz_f: -      return {Instruction::FAdd, FTZ_MustBeOn}; -    case Intrinsic::nvvm_mul_rn_d: -      return {Instruction::FMul, FTZ_Any}; -    case Intrinsic::nvvm_mul_rn_f: -      return {Instruction::FMul, FTZ_MustBeOff}; -    case Intrinsic::nvvm_mul_rn_ftz_f: -      return {Instruction::FMul, FTZ_MustBeOn};      case Intrinsic::nvvm_div_rn_d:        return {Instruction::FDiv, FTZ_Any}; -    case Intrinsic::nvvm_div_rn_f: -      return {Instruction::FDiv, FTZ_MustBeOff}; -    case Intrinsic::nvvm_div_rn_ftz_f: -      return {Instruction::FDiv, FTZ_MustBeOn};      // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but      // need special handling. @@ -342,10 +312,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) {      // as well.      case Intrinsic::nvvm_rcp_rn_d:        return {SPC_Reciprocal, FTZ_Any}; -    case Intrinsic::nvvm_rcp_rn_f: -      return {SPC_Reciprocal, FTZ_MustBeOff}; -    case Intrinsic::nvvm_rcp_rn_ftz_f: -      return {SPC_Reciprocal, FTZ_MustBeOn};        // We do not currently simplify intrinsics that give an approximate        // answer. 
These include: diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 42f5a4e624c4..56af80f9cede 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -350,7 +350,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {            unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();            TmpOffset += SL->getElementOffset(Idx);          } else { -          uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +          uint64_t S = GTI.getSequentialElementStride(DL);            for (;;) {              if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {                // Constant-offset addressing. diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index ed96339240d9..26ed74108ec3 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -565,7 +565,7 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {  /// operand. If so Imm will receive the 32-bit value.  static bool isInt32Immediate(SDNode *N, unsigned &Imm) {    if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { -    Imm = cast<ConstantSDNode>(N)->getZExtValue(); +    Imm = N->getAsZExtVal();      return true;    }    return false; @@ -575,7 +575,7 @@ static bool isInt32Immediate(SDNode *N, unsigned &Imm) {  /// operand.  If so Imm will receive the 64-bit value.  static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {    if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) { -    Imm = cast<ConstantSDNode>(N)->getZExtValue(); +    Imm = N->getAsZExtVal();      return true;    }    return false; @@ -1500,7 +1500,7 @@ static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) {    SDLoc dl(N);    // Get 64 bit value. -  int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); +  int64_t Imm = N->getAsZExtVal();    if (unsigned MinSize = allUsesTruncate(CurDAG, N)) {      uint64_t SextImm = SignExtend64(Imm, MinSize);      SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); @@ -4923,7 +4923,7 @@ bool PPCDAGToDAGISel::trySelectLoopCountIntrinsic(SDNode *N) {    SDNode *NewDecrement = CurDAG->getMachineNode(DecrementOpcode, DecrementLoc,                                                  MVT::i1, DecrementOps); -  unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); +  unsigned Val = RHS->getAsZExtVal();    bool IsBranchOnTrue = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val);    unsigned Opcode = IsBranchOnTrue ? PPC::BC : PPC::BCn; @@ -5765,7 +5765,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {        break;      // If the multiplier fits int16, we can handle it with mulli. -    int64_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue(); +    int64_t Imm = Op1->getAsZExtVal();      unsigned Shift = llvm::countr_zero<uint64_t>(Imm);      if (isInt<16>(Imm) || !Shift)        break; @@ -6612,8 +6612,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {      // For us to materialize these using one instruction, we must be able to      // represent them as signed 16-bit integers. 
-    uint64_t True  = cast<ConstantSDNode>(TrueRes)->getZExtValue(), -             False = cast<ConstantSDNode>(FalseRes)->getZExtValue(); +    uint64_t True = TrueRes->getAsZExtVal(), False = FalseRes->getAsZExtVal();      if (!isInt<16>(True) || !isInt<16>(False))        break; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8f27e6677afa..235df1880b37 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2566,7 +2566,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {      if (LeadingZero) {        if (!UniquedVals[Multiple-1].getNode())          return DAG.getTargetConstant(0, SDLoc(N), MVT::i32);  // 0,0,0,undef -      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); +      int Val = UniquedVals[Multiple - 1]->getAsZExtVal();        if (Val < 16)                                   // 0,0,0,4 -> vspltisw(4)          return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);      } @@ -2635,11 +2635,11 @@ bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {    if (!isa<ConstantSDNode>(N))      return false; -  Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); +  Imm = (int16_t)N->getAsZExtVal();    if (N->getValueType(0) == MVT::i32) -    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); +    return Imm == (int32_t)N->getAsZExtVal();    else -    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +    return Imm == (int64_t)N->getAsZExtVal();  }  bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {    return isIntS16Immediate(Op.getNode(), Imm); @@ -2684,7 +2684,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {    if (!isa<ConstantSDNode>(N))      return false; -  Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); +  Imm = (int64_t)N->getAsZExtVal();    return isInt<34>(Imm);  }  bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { @@ -15580,7 +15580,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,          NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)        break; -    uint64_t Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +    uint64_t Imm = Op2->getAsZExtVal();      // Make sure that the constant is narrow enough to fit in the narrow type.      
if (!isUInt<32>(Imm))        break; @@ -16795,7 +16795,7 @@ void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,      return;    if (!isa<ConstantSDNode>(Ops[1].getNode()))      return; -  auto IntrinsicID = cast<ConstantSDNode>(Ops[1].getNode())->getZExtValue(); +  auto IntrinsicID = Ops[1].getNode()->getAsZExtVal();    if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&        IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)      return; @@ -18430,7 +18430,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,      if (Flags & PPC::MOF_RPlusSImm16) {        SDValue Op0 = N.getOperand(0);        SDValue Op1 = N.getOperand(1); -      int16_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue(); +      int16_t Imm = Op1->getAsZExtVal();        if (!Align || isAligned(*Align, Imm)) {          Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());          Base = Op0; diff --git a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 375e63654db1..8a37e40414ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -270,12 +270,15 @@ def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;  // Link register  def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>; -//let Aliases = [LR] in -def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>; +def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]> { +  let Aliases = [LR]; +}  // Count register  def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>; -def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>; +def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]> { +  let Aliases = [CTR]; +}  // VRsave register  def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4759aa951664..d616aaeddf41 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -466,10 +466,6 @@ public:    bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } -  bool isGPRF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - -  bool isGPRPF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } -    static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,                                    RISCVMCExpr::VariantKind &VK) {      if (auto *RE = dyn_cast<RISCVMCExpr>(Expr)) { @@ -2039,9 +2035,8 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {    SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); -  RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL; -  if (Identifier.consume_back("@plt")) -    Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; +  RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; +  (void)Identifier.consume_back("@plt");    MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);    Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 50ed85acdec0..697ad476ff8c 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -579,7 +579,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,    // Select the recommended relocation type R_RISCV_CALL_PLT.    if (!Info.Callee.isReg()) -    Info.Callee.setTargetFlags(RISCVII::MO_PLT); +    Info.Callee.setTargetFlags(RISCVII::MO_CALL);    MachineInstrBuilder Call =        MIRBuilder diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 079906d1958c..ab8070772fe5 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -113,7 +113,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)    getActionDefinitionsBuilder(G_BITREVERSE).maxScalar(0, sXLen).lower();    auto &BSWAPActions = getActionDefinitionsBuilder(G_BSWAP); -  if (ST.hasStdExtZbb()) +  if (ST.hasStdExtZbb() || ST.hasStdExtZbkb())      BSWAPActions.legalFor({sXLen}).clampScalar(0, sXLen, sXLen);    else      BSWAPActions.maxScalar(0, sXLen).lower(); @@ -411,8 +411,9 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI,    return true;  } -bool RISCVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                        MachineInstr &MI) const { +bool RISCVLegalizerInfo::legalizeCustom( +    LegalizerHelper &Helper, MachineInstr &MI, +    LostDebugLocObserver &LocObserver) const {    MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;    GISelChangeObserver &Observer = Helper.Observer;    switch (MI.getOpcode()) { diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index 48c36976501f..f3ec6be16734 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -21,7 +21,6 @@ class GISelChangeObserver;  class MachineIRBuilder;  class RISCVSubtarget; -/// This class provides the information for the target register banks.  
class RISCVLegalizerInfo : public LegalizerInfo {    const RISCVSubtarget &STI;    const unsigned XLen; @@ -30,7 +29,8 @@ class RISCVLegalizerInfo : public LegalizerInfo {  public:    RISCVLegalizerInfo(const RISCVSubtarget &ST); -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;    bool legalizeIntrinsic(LegalizerHelper &Helper,                           MachineInstr &MI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index aba2511959af..8d97c5ffd20a 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -186,30 +186,37 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) {  }  static std::pair<uint8_t, uint8_t> -getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL, -                                    uint8_t SEW) { +getEEWAndEMUL(unsigned Opcode, RISCVII::VLMUL LMUL, uint8_t SEW) {    uint8_t EEW;    switch (Opcode) {    case RISCV::VLM_V:    case RISCV::VSM_V:    case RISCV::VLE8_V:    case RISCV::VSE8_V: +  case RISCV::VLSE8_V: +  case RISCV::VSSE8_V:      EEW = 8;      break;    case RISCV::VLE16_V:    case RISCV::VSE16_V: +  case RISCV::VLSE16_V: +  case RISCV::VSSE16_V:      EEW = 16;      break;    case RISCV::VLE32_V:    case RISCV::VSE32_V: +  case RISCV::VLSE32_V: +  case RISCV::VSSE32_V:      EEW = 32;      break;    case RISCV::VLE64_V:    case RISCV::VSE64_V: +  case RISCV::VLSE64_V: +  case RISCV::VSSE64_V:      EEW = 64;      break;    default: -    llvm_unreachable("Opcode is not a vector unit stride load nor store"); +    llvm_unreachable("Could not determine EEW from Opcode");    }    auto EMUL = RISCVVType::getSameRatioLMUL(SEW, LMUL, EEW); @@ -218,6 +225,18 @@ getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL,    return std::make_pair(EEW, *EMUL);  } +bool opcodeHasEEWAndEMULInfo(unsigned short Opcode) { +  return Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || +         Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || +         Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || +         Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || +         Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V || +         Opcode == RISCV::VLSE8_V || Opcode == RISCV::VSSE8_V || +         Opcode == RISCV::VLSE16_V || Opcode == RISCV::VSSE16_V || +         Opcode == RISCV::VLSE32_V || Opcode == RISCV::VSSE32_V || +         Opcode == RISCV::VLSE64_V || Opcode == RISCV::VSSE64_V; +} +  unsigned RISCVInstrumentManager::getSchedClassID(      const MCInstrInfo &MCII, const MCInst &MCI,      const llvm::SmallVector<Instrument *> &IVec) const { @@ -249,13 +268,9 @@ unsigned RISCVInstrumentManager::getSchedClassID(    uint8_t SEW = SI ? 
SI->getSEW() : 0;    const RISCVVInversePseudosTable::PseudoInfo *RVV = nullptr; -  if (Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || -      Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || -      Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || -      Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || -      Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V) { +  if (opcodeHasEEWAndEMULInfo(Opcode)) {      RISCVII::VLMUL VLMUL = static_cast<RISCVII::VLMUL>(LMUL); -    auto [EEW, EMUL] = getEEWAndEMULForUnitStrideLoadStore(Opcode, VLMUL, SEW); +    auto [EEW, EMUL] = getEEWAndEMUL(Opcode, VLMUL, SEW);      RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, EMUL, EEW);    } else {      // Check if it depends on LMUL and SEW diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 716fb67c5824..7ce08eabdeb6 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -329,16 +329,17 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF,    return true;  } -bool RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, -                                  int64_t &Value) const { +std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, +                                                   MCAsmLayout &Layout, +                                                   int64_t &Value) const {    if (LF.isSigned()) -    return false; +    return std::make_pair(false, false);    const MCExpr &Expr = LF.getValue();    if (ULEB128Reloc) {      LF.getFixups().push_back(          MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc()));    } -  return Expr.evaluateKnownAbsolute(Value, Layout); +  return std::make_pair(Expr.evaluateKnownAbsolute(Value, Layout), false);  }  // Given a compressed control flow instruction this function returns diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index 2ad6534ac8bc..902b44bba70f 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -100,8 +100,8 @@ public:                            bool &WasRelaxed) const override;    bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout,                       bool &WasRelaxed) const override; -  bool relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, -                   int64_t &Value) const override; +  std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, +                                    int64_t &Value) const override;    bool writeNopData(raw_ostream &OS, uint64_t Count,                      const MCSubtargetInfo *STI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index c32210fc1419..433e2e6f80bd 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -254,7 +254,6 @@ static inline bool isFirstDefTiedToFirstUse(const MCInstrDesc &Desc) {  enum {    MO_None = 0,    MO_CALL = 1, -  MO_PLT = 2,    MO_LO = 3,    MO_HI = 4,    MO_PCREL_LO = 5, 
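Note on the RISC-V call-lowering changes around this point: with MO_PLT removed above, every call target is tagged RISCVII::MO_CALL, and the asm printer lowers that single flag to the VK_RISCV_CALL_PLT variant, i.e. the recommended R_RISCV_CALL_PLT relocation, for local and preemptible callees alike; the explicit "@plt" suffix printing goes away in the next hunk. A minimal standalone C++ sketch of that mapping follows; the enums and the lowerCallFlag helper are simplified stand-ins for illustration, not the real RISCVII/RISCVMCExpr definitions.

// Sketch only: simplified stand-ins for the RISCV operand flags and
// MCExpr variant kinds, mirroring the mapping visible in this import.
#include <cassert>
#include <cstdio>

enum MachineOperandFlag { MO_None, MO_CALL, MO_LO, MO_HI }; // MO_PLT no longer exists
enum VariantKind { VK_None, VK_CALL_PLT, VK_LO, VK_HI };

// Every call target now carries MO_CALL, and MO_CALL lowers to the
// PLT-style variant kind (R_RISCV_CALL_PLT).
VariantKind lowerCallFlag(MachineOperandFlag F) {
  switch (F) {
  case MO_CALL: return VK_CALL_PLT;
  case MO_LO:   return VK_LO;
  case MO_HI:   return VK_HI;
  default:      return VK_None;
  }
}

int main() {
  // Local and preemptible callees take the same path; there is no
  // shouldAssumeDSOLocal() split in call lowering any more.
  assert(lowerCallFlag(MO_CALL) == VK_CALL_PLT);
  std::puts("calls lower to R_RISCV_CALL_PLT");
}

The linker is expected to resolve R_RISCV_CALL_PLT to a direct call when the callee turns out to be non-preemptible, which is why a single flag suffices.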
diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index d67351102bc1..64ddae61b1bc 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -41,8 +41,6 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {    if (HasVariant)      OS << '%' << getVariantKindName(getKind()) << '(';    Expr->print(OS, MAI); -  if (Kind == VK_RISCV_CALL_PLT) -    OS << "@plt";    if (HasVariant)      OS << ')';  } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 0fd514fa87cd..f2bd5118fc07 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -747,9 +747,6 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym,      Kind = RISCVMCExpr::VK_RISCV_None;      break;    case RISCVII::MO_CALL: -    Kind = RISCVMCExpr::VK_RISCV_CALL; -    break; -  case RISCVII::MO_PLT:      Kind = RISCVMCExpr::VK_RISCV_CALL_PLT;      break;    case RISCVII::MO_LO: diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 24a13f93af88..103a2e2da7b9 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -109,6 +109,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,      return expandRV32ZdinxStore(MBB, MBBI);    case RISCV::PseudoRV32ZdinxLD:      return expandRV32ZdinxLoad(MBB, MBBI); +  case RISCV::PseudoCCMOVGPRNoX0:    case RISCV::PseudoCCMOVGPR:    case RISCV::PseudoCCADD:    case RISCV::PseudoCCSUB: @@ -134,6 +135,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,    case RISCV::PseudoCCSLLIW:    case RISCV::PseudoCCSRLIW:    case RISCV::PseudoCCSRAIW: +  case RISCV::PseudoCCANDN: +  case RISCV::PseudoCCORN: +  case RISCV::PseudoCCXNOR:      return expandCCOp(MBB, MBBI, NextMBBI);    case RISCV::PseudoVSETVLI:    case RISCV::PseudoVSETVLIX0: @@ -191,7 +195,8 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,    Register DestReg = MI.getOperand(0).getReg();    assert(MI.getOperand(4).getReg() == DestReg); -  if (MI.getOpcode() == RISCV::PseudoCCMOVGPR) { +  if (MI.getOpcode() == RISCV::PseudoCCMOVGPR || +      MI.getOpcode() == RISCV::PseudoCCMOVGPRNoX0) {      // Add MV.      
BuildMI(TrueBB, DL, TII->get(RISCV::ADDI), DestReg)          .add(MI.getOperand(5)) @@ -225,6 +230,9 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB,      case RISCV::PseudoCCSLLIW: NewOpc = RISCV::SLLIW; break;      case RISCV::PseudoCCSRLIW: NewOpc = RISCV::SRLIW; break;      case RISCV::PseudoCCSRAIW: NewOpc = RISCV::SRAIW; break; +    case RISCV::PseudoCCANDN:  NewOpc = RISCV::ANDN;  break; +    case RISCV::PseudoCCORN:   NewOpc = RISCV::ORN;   break; +    case RISCV::PseudoCCXNOR:  NewOpc = RISCV::XNOR;  break;      }      BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg)          .add(MI.getOperand(5)) diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFeatures.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFeatures.td index 59b202606dad..bb7a3291085d 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1021,6 +1021,12 @@ def TuneShortForwardBranchOpt  def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">;  def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneConditionalCompressedMoveFusion +    : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion", +                       "true", "Enable branch+c.mv fusion">; +def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; +def NoConditionalMoveFusion  : Predicate<"!Subtarget->hasConditionalMoveFusion()">; +  def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",                                     "SiFive 7-Series processors",                                     [TuneNoDefaultUnroll, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 5ad1e082344e..1129206800ad 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -362,7 +362,7 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr,      VecOperand = i; -    TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType()); +    TypeSize TS = GTI.getSequentialElementStride(*DL);      if (TS.isScalable())        return std::make_pair(nullptr, nullptr); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index bfa3bf3cc74e..0d8688ba2eae 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -763,14 +763,12 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) {      return false;    EVT LoadVT = Ld->getMemoryVT(); -  bool IsPre = (AM == ISD::PRE_INC || AM == ISD::PRE_DEC); -  bool IsPost = (AM == ISD::POST_INC || AM == ISD::POST_DEC); +  assert((AM == ISD::PRE_INC || AM == ISD::POST_INC) && +         "Unexpected addressing mode"); +  bool IsPre = AM == ISD::PRE_INC; +  bool IsPost = AM == ISD::POST_INC;    int64_t Offset = C->getSExtValue(); -  // Convert decrements to increments by a negative quantity. -  if (AM == ISD::PRE_DEC || AM == ISD::POST_DEC) -    Offset = -Offset; -    // The constants that can be encoded in the THeadMemIdx instructions    // are of the form (sign_extend(imm5) << imm2).    
int64_t Shift; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 03a59f8a8b57..0a1a466af591 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -814,8 +814,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,        setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT,                           Custom);        setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); -      setOperationAction( -          {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); +      setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT, +                          ISD::SSUBSAT, ISD::USUBSAT}, +                         VT, Legal);        // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL"        // nodes which truncate by one power of two at a time. @@ -1184,9 +1185,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,          if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV())            setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom); -        setOperationAction( -            {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, -            Custom); +        setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT, +                            ISD::SSUBSAT, ISD::USUBSAT}, +                           VT, Custom);          setOperationAction(ISD::VSELECT, VT, Custom);          setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1350,8 +1351,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    }    if (Subtarget.hasVendorXTHeadMemIdx()) { -    for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::POST_DEC; -         ++im) { +    for (unsigned im : {ISD::PRE_INC, ISD::POST_INC}) {        setIndexedLoadAction(im, MVT::i8, Legal);        setIndexedStoreAction(im, MVT::i8, Legal);        setIndexedLoadAction(im, MVT::i16, Legal); @@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,    setPrefLoopAlignment(Subtarget.getPrefLoopAlignment());    setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, -                       ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL, -                       ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); +                       ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND, +                       ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});    if (Subtarget.is64Bit())      setTargetDAGCombine(ISD::SRA); @@ -2711,11 +2711,19 @@ InstructionCost RISCVTargetLowering::getVRGatherVICost(MVT VT) const {    return getLMULCost(VT);  } -/// Return the cost of a vslidedown.vi/vx or vslideup.vi/vx instruction +/// Return the cost of a vslidedown.vx or vslideup.vx instruction +/// for the type VT.  (This does not cover the vslide1up or vslide1down +/// variants.)  Slides may be linear in the number of vregs implied by LMUL, +/// or may track the vrgather.vv cost. It is implementation-dependent. +InstructionCost RISCVTargetLowering::getVSlideVXCost(MVT VT) const { +  return getLMULCost(VT); +} + +/// Return the cost of a vslidedown.vi or vslideup.vi instruction  /// for the type VT.  (This does not cover the vslide1up or vslide1down  /// variants.)  Slides may be linear in the number of vregs implied by LMUL,  /// or may track the vrgather.vv cost. 
It is implementation-dependent. -InstructionCost RISCVTargetLowering::getVSlideCost(MVT VT) const { +InstructionCost RISCVTargetLowering::getVSlideVICost(MVT VT) const {    return getLMULCost(VT);  } @@ -2811,8 +2819,8 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,    SDValue SplatZero = DAG.getNode(        RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT),        DAG.getConstant(0, DL, Subtarget.getXLenVT()), VL); -  Res = DAG.getNode(RISCVISD::VSELECT_VL, DL, DstContainerVT, IsNan, SplatZero, -                    Res, VL); +  Res = DAG.getNode(RISCVISD::VMERGE_VL, DL, DstContainerVT, IsNan, SplatZero, +                    Res, DAG.getUNDEF(DstContainerVT), VL);    if (DstVT.isFixedLengthVector())      Res = convertFromScalableVector(DstVT, Res, DAG, Subtarget); @@ -3489,7 +3497,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,      for (unsigned I = 0; I < NumElts;) {        SDValue V = Op.getOperand(I); -      bool BitValue = !V.isUndef() && cast<ConstantSDNode>(V)->getZExtValue(); +      bool BitValue = !V.isUndef() && V->getAsZExtVal();        Bits |= ((uint64_t)BitValue << BitPos);        ++BitPos;        ++I; @@ -3620,8 +3628,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,      for (const auto &OpIdx : enumerate(Op->op_values())) {        const auto &SeqV = OpIdx.value();        if (!SeqV.isUndef()) -        SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask) -                       << (OpIdx.index() * EltBitSize)); +        SplatValue |= +            ((SeqV->getAsZExtVal() & EltMask) << (OpIdx.index() * EltBitSize));      }      // On RV64, sign-extend from 32 to 64 bits where possible in order to @@ -3650,10 +3658,10 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,    // would require bit-manipulation instructions to construct the splat value.    SmallVector<SDValue> Sequence;    const auto *BV = cast<BuildVectorSDNode>(Op); -  if (VT.isInteger() && EltBitSize < 64 && +  if (VT.isInteger() && EltBitSize < Subtarget.getELen() &&        ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&        BV->getRepeatedSequence(Sequence) && -      (Sequence.size() * EltBitSize) <= 64) { +      (Sequence.size() * EltBitSize) <= Subtarget.getELen()) {      unsigned SeqLen = Sequence.size();      MVT ViaIntVT = MVT::getIntegerVT(EltBitSize * SeqLen);      assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32 || @@ -3676,8 +3684,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG,      // vector type.      
for (const auto &SeqV : Sequence) {        if (!SeqV.isUndef()) -        SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask) -                       << (EltIdx * EltBitSize)); +        SplatValue |= +            ((SeqV->getAsZExtVal() & EltMask) << (EltIdx * EltBitSize));        EltIdx++;      } @@ -3938,8 +3946,7 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru,            (isa<RegisterSDNode>(VL) &&             cast<RegisterSDNode>(VL)->getReg() == RISCV::X0))          NewVL = DAG.getRegister(RISCV::X0, MVT::i32); -      else if (isa<ConstantSDNode>(VL) && -               isUInt<4>(cast<ConstantSDNode>(VL)->getZExtValue())) +      else if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal()))          NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL);        if (NewVL) { @@ -5401,8 +5408,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG,      SDValue XIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),                                      {X, X, DAG.getCondCode(ISD::SETOEQ),                                       DAG.getUNDEF(ContainerVT), Mask, VL}); -    NewY = -        DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, XIsNonNan, Y, X, VL); +    NewY = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, XIsNonNan, Y, X, +                       DAG.getUNDEF(ContainerVT), VL);    }    SDValue NewX = X; @@ -5410,8 +5417,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG,      SDValue YIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(),                                      {Y, Y, DAG.getCondCode(ISD::SETOEQ),                                       DAG.getUNDEF(ContainerVT), Mask, VL}); -    NewX = -        DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, YIsNonNan, X, Y, VL); +    NewX = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, YIsNonNan, X, Y, +                       DAG.getUNDEF(ContainerVT), VL);    }    unsigned Opc = @@ -5458,6 +5465,7 @@ static unsigned getRISCVVLOp(SDValue Op) {    OP_CASE(UADDSAT)    OP_CASE(SSUBSAT)    OP_CASE(USUBSAT) +  OP_CASE(AVGFLOORU)    OP_CASE(FADD)    OP_CASE(FSUB)    OP_CASE(FMUL) @@ -5528,7 +5536,6 @@ static unsigned getRISCVVLOp(SDValue Op) {        return RISCVISD::VMXOR_VL;      return RISCVISD::XOR_VL;    case ISD::VP_SELECT: -    return RISCVISD::VSELECT_VL;    case ISD::VP_MERGE:      return RISCVISD::VMERGE_VL;    case ISD::VP_ASHR: @@ -6453,6 +6460,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,           !Subtarget.hasVInstructionsF16()))        return SplitVectorOp(Op, DAG);      [[fallthrough]]; +  case ISD::AVGFLOORU:    case ISD::SADDSAT:    case ISD::UADDSAT:    case ISD::SSUBSAT: @@ -6914,7 +6922,7 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG,    MVT VT = N->getSimpleValueType(0);    SDLoc DL(N); -  if (!Subtarget.hasShortForwardBranchOpt()) { +  if (!Subtarget.hasConditionalMoveFusion()) {      // (select c, -1, y) -> -c | y      if (isAllOnesConstant(TrueV)) {        SDValue Neg = DAG.getNegative(CondV, DL, VT); @@ -7078,7 +7086,7 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const {      // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c))      // Unless we have the short forward branch optimization. 
-    if (!Subtarget.hasShortForwardBranchOpt()) +    if (!Subtarget.hasConditionalMoveFusion())        return DAG.getNode(            ISD::OR, DL, VT,            DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV), @@ -7456,8 +7464,9 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG,                            DAG.getUNDEF(ContainerVT), SplatZero, VL);    SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,                               DAG.getUNDEF(ContainerVT), SplatTrueVal, VL); -  SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, -                               SplatTrueVal, SplatZero, VL); +  SDValue Select = +      DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, SplatTrueVal, +                  SplatZero, DAG.getUNDEF(ContainerVT), VL);    return convertFromScalableVector(VecVT, Select, DAG, Subtarget);  } @@ -7906,8 +7915,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,    // Use tail agnostic policy if Idx is the last index of Vec.    unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;    if (VecVT.isFixedLengthVector() && isa<ConstantSDNode>(Idx) && -      cast<ConstantSDNode>(Idx)->getZExtValue() + 1 == -          VecVT.getVectorNumElements()) +      Idx->getAsZExtVal() + 1 == VecVT.getVectorNumElements())      Policy = RISCVII::TAIL_AGNOSTIC;    SDValue Slideup = getVSlideup(DAG, Subtarget, DL, ContainerVT, Vec, ValInVec,                                  Idx, Mask, InsertVL, Policy); @@ -8167,7 +8175,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,        const auto [MinVLMAX, MaxVLMAX] =            RISCVTargetLowering::computeVLMAXBounds(VT, Subtarget); -      uint64_t AVLInt = cast<ConstantSDNode>(AVL)->getZExtValue(); +      uint64_t AVLInt = AVL->getAsZExtVal();        if (AVLInt <= MinVLMAX) {          I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT);        } else if (AVLInt >= 2 * MaxVLMAX) { @@ -8233,15 +8241,14 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG,      SDValue Mask = Operands[NumOps - 3];      SDValue MaskedOff = Operands[1];      // Assume Policy operand is the last operand. -    uint64_t Policy = -        cast<ConstantSDNode>(Operands[NumOps - 1])->getZExtValue(); +    uint64_t Policy = Operands[NumOps - 1]->getAsZExtVal();      // We don't need to select maskedoff if it's undef.      if (MaskedOff.isUndef())        return Vec;      // TAMU      if (Policy == RISCVII::TAIL_AGNOSTIC) -      return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, -                         AVL); +      return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, +                         DAG.getUNDEF(VT), AVL);      // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma.      // It's fine because vmerge does not care mask policy.      
return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, @@ -8489,8 +8496,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,          DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT,                      {VID, SplattedIdx, DAG.getCondCode(ISD::SETEQ),                       DAG.getUNDEF(MaskVT), Mask, VL}); -    return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal, -                       Vec, VL); +    return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, SelectCond, SplattedVal, +                       Vec, DAG.getUNDEF(VT), VL);    }    // EGS * EEW >= 128 bits    case Intrinsic::riscv_vaesdf_vv: @@ -10243,8 +10250,8 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV(    SDLoc DL(Op);    SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; -  SDValue Select = -      DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, Op1, Op2, VL); +  SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1, +                               Op2, DAG.getUNDEF(ContainerVT), VL);    return convertFromScalableVector(VT, Select, DAG, Subtarget);  } @@ -10327,9 +10334,14 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const {            Ops.push_back(DAG.getUNDEF(ContainerVT));        } else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) ==                   OpIdx.index()) { -        // For VP_MERGE, copy the false operand instead of an undef value. -        assert(Op.getOpcode() == ISD::VP_MERGE); -        Ops.push_back(Ops.back()); +        if (Op.getOpcode() == ISD::VP_MERGE) { +          // For VP_MERGE, copy the false operand instead of an undef value. +          Ops.push_back(Ops.back()); +        } else { +          assert(Op.getOpcode() == ISD::VP_SELECT); +          // For VP_SELECT, add an undef value. +          Ops.push_back(DAG.getUNDEF(ContainerVT)); +        }        }      }      // Pass through operands which aren't fixed-length vectors. @@ -10379,8 +10391,8 @@ SDValue RISCVTargetLowering::lowerVPExtMaskOp(SDValue Op,    SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,                                DAG.getUNDEF(ContainerVT), SplatValue, VL); -  SDValue Result = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Src, -                               Splat, ZeroSplat, VL); +  SDValue Result = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Src, Splat, +                               ZeroSplat, DAG.getUNDEF(ContainerVT), VL);    if (!VT.isFixedLengthVector())      return Result;    return convertFromScalableVector(VT, Result, DAG, Subtarget); @@ -10508,8 +10520,8 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,              RISCVISDExtOpc == RISCVISD::VZEXT_VL ? 1 : -1, DL, XLenVT);          SDValue OneSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT,                                         DAG.getUNDEF(IntVT), One, VL); -        Src = DAG.getNode(RISCVISD::VSELECT_VL, DL, IntVT, Src, OneSplat, -                          ZeroSplat, VL); +        Src = DAG.getNode(RISCVISD::VMERGE_VL, DL, IntVT, Src, OneSplat, +                          ZeroSplat, DAG.getUNDEF(IntVT), VL);        } else if (DstEltSize > (2 * SrcEltSize)) {          // Widen before converting.          
MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(DstEltSize / 2), @@ -10633,8 +10645,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op,      SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,                                         DAG.getUNDEF(ContainerVT),                                         DAG.getConstant(0, DL, XLenVT), EVL1); -    Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1, -                      SplatZeroOp1, EVL1); +    Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op1, SplatOneOp1, +                      SplatZeroOp1, DAG.getUNDEF(ContainerVT), EVL1);      SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,                                        DAG.getUNDEF(ContainerVT), @@ -10642,8 +10654,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op,      SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT,                                         DAG.getUNDEF(ContainerVT),                                         DAG.getConstant(0, DL, XLenVT), EVL2); -    Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2, -                      SplatZeroOp2, EVL2); +    Op2 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op2, SplatOneOp2, +                      SplatZeroOp2, DAG.getUNDEF(ContainerVT), EVL2);    }    int64_t ImmValue = cast<ConstantSDNode>(Offset)->getSExtValue(); @@ -10713,8 +10725,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op,      SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IndicesVT,                                      DAG.getUNDEF(IndicesVT),                                      DAG.getConstant(0, DL, XLenVT), EVL); -    Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, IndicesVT, Op1, SplatOne, -                      SplatZero, EVL); +    Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, IndicesVT, Op1, SplatOne, +                      SplatZero, DAG.getUNDEF(IndicesVT), EVL);    }    unsigned EltSize = GatherVT.getScalarSizeInBits(); @@ -12197,7 +12209,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,    if (VT.isVector())      return SDValue(); -  if (!Subtarget.hasShortForwardBranchOpt()) { +  if (!Subtarget.hasConditionalMoveFusion()) {      // (select cond, x, (and x, c)) has custom lowering with Zicond.      if ((!Subtarget.hasStdExtZicond() &&           !Subtarget.hasVendorXVentanaCondOps()) || @@ -12850,9 +12862,9 @@ struct CombineResult;  /// Helper class for folding sign/zero extensions.  /// In particular, this class is used for the following combines: -/// add | add_vl -> vwadd(u) | vwadd(u)_w -/// sub | sub_vl -> vwsub(u) | vwsub(u)_w -/// mul | mul_vl -> vwmul(u) | vwmul_su +/// add_vl -> vwadd(u) | vwadd(u)_w +/// sub_vl -> vwsub(u) | vwsub(u)_w +/// mul_vl -> vwmul(u) | vwmul_su  ///  /// An object of this class represents an operand of the operation we want to  /// combine. @@ -12897,8 +12909,6 @@ struct NodeExtensionHelper {    /// E.g., for zext(a), this would return a.    SDValue getSource() const {      switch (OrigOperand.getOpcode()) { -    case ISD::ZERO_EXTEND: -    case ISD::SIGN_EXTEND:      case RISCVISD::VSEXT_VL:      case RISCVISD::VZEXT_VL:        return OrigOperand.getOperand(0); @@ -12915,8 +12925,7 @@ struct NodeExtensionHelper {    /// Get or create a value that can feed \p Root with the given extension \p    /// SExt. If \p SExt is std::nullopt, this returns the source of this operand.    /// \see ::getSource(). 
-  SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG, -                                const RISCVSubtarget &Subtarget, +  SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG,                                  std::optional<bool> SExt) const {      if (!SExt.has_value())        return OrigOperand; @@ -12931,10 +12940,8 @@ struct NodeExtensionHelper {      // If we need an extension, we should be changing the type.      SDLoc DL(Root); -    auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); +    auto [Mask, VL] = getMaskAndVL(Root);      switch (OrigOperand.getOpcode()) { -    case ISD::ZERO_EXTEND: -    case ISD::SIGN_EXTEND:      case RISCVISD::VSEXT_VL:      case RISCVISD::VZEXT_VL:        return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL); @@ -12974,15 +12981,12 @@ struct NodeExtensionHelper {    /// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()).    static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) {      switch (Opcode) { -    case ISD::ADD:      case RISCVISD::ADD_VL:      case RISCVISD::VWADD_W_VL:      case RISCVISD::VWADDU_W_VL:        return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL; -    case ISD::MUL:      case RISCVISD::MUL_VL:        return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; -    case ISD::SUB:      case RISCVISD::SUB_VL:      case RISCVISD::VWSUB_W_VL:      case RISCVISD::VWSUBU_W_VL: @@ -12995,8 +12999,7 @@ struct NodeExtensionHelper {    /// Get the opcode to materialize \p Opcode(sext(a), zext(b)) ->    /// newOpcode(a, b).    static unsigned getSUOpcode(unsigned Opcode) { -    assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) && -           "SU is only supported for MUL"); +    assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL");      return RISCVISD::VWMULSU_VL;    } @@ -13004,10 +13007,8 @@ struct NodeExtensionHelper {    /// newOpcode(a, b).    static unsigned getWOpcode(unsigned Opcode, bool IsSExt) {      switch (Opcode) { -    case ISD::ADD:      case RISCVISD::ADD_VL:        return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; -    case ISD::SUB:      case RISCVISD::SUB_VL:        return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL;      default: @@ -13017,33 +13018,19 @@ struct NodeExtensionHelper {    using CombineToTry = std::function<std::optional<CombineResult>(        SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/, -      const NodeExtensionHelper & /*RHS*/, SelectionDAG &, -      const RISCVSubtarget &)>; +      const NodeExtensionHelper & /*RHS*/)>;    /// Check if this node needs to be fully folded or extended for all users.    bool needToPromoteOtherUsers() const { return EnforceOneUse; }    /// Helper method to set the various fields of this struct based on the    /// type of \p Root. 
-  void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG, -                              const RISCVSubtarget &Subtarget) { +  void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) {      SupportsZExt = false;      SupportsSExt = false;      EnforceOneUse = true;      CheckMask = true; -    unsigned Opc = OrigOperand.getOpcode(); -    switch (Opc) { -    case ISD::ZERO_EXTEND: -    case ISD::SIGN_EXTEND: { -      if (OrigOperand.getValueType().isVector()) { -        SupportsZExt = Opc == ISD::ZERO_EXTEND; -        SupportsSExt = Opc == ISD::SIGN_EXTEND; -        SDLoc DL(Root); -        MVT VT = Root->getSimpleValueType(0); -        std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); -      } -      break; -    } +    switch (OrigOperand.getOpcode()) {      case RISCVISD::VZEXT_VL:        SupportsZExt = true;        Mask = OrigOperand.getOperand(1); @@ -13099,16 +13086,8 @@ struct NodeExtensionHelper {    }    /// Check if \p Root supports any extension folding combines. -  static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) { +  static bool isSupportedRoot(const SDNode *Root) {      switch (Root->getOpcode()) { -    case ISD::ADD: -    case ISD::SUB: -    case ISD::MUL: { -      const TargetLowering &TLI = DAG.getTargetLoweringInfo(); -      if (!TLI.isTypeLegal(Root->getValueType(0))) -        return false; -      return Root->getValueType(0).isScalableVector(); -    }      case RISCVISD::ADD_VL:      case RISCVISD::MUL_VL:      case RISCVISD::VWADD_W_VL: @@ -13123,10 +13102,9 @@ struct NodeExtensionHelper {    }    /// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx). -  NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG, -                      const RISCVSubtarget &Subtarget) { -    assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an " -                                         "unsupported root"); +  NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) { +    assert(isSupportedRoot(Root) && "Trying to build an helper with an " +                                    "unsupported root");      assert(OperandIdx < 2 && "Requesting something else than LHS or RHS");      OrigOperand = Root->getOperand(OperandIdx); @@ -13142,7 +13120,7 @@ struct NodeExtensionHelper {          SupportsZExt =              Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL;          SupportsSExt = !SupportsZExt; -        std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget); +        std::tie(Mask, VL) = getMaskAndVL(Root);          CheckMask = true;          // There's no existing extension here, so we don't have to worry about          // making sure it gets removed. @@ -13151,7 +13129,7 @@ struct NodeExtensionHelper {        }        [[fallthrough]];      default: -      fillUpExtensionSupport(Root, DAG, Subtarget); +      fillUpExtensionSupport(Root, DAG);        break;      }    } @@ -13167,27 +13145,14 @@ struct NodeExtensionHelper {    }    /// Helper function to get the Mask and VL from \p Root. 
-  static std::pair<SDValue, SDValue> -  getMaskAndVL(const SDNode *Root, SelectionDAG &DAG, -               const RISCVSubtarget &Subtarget) { -    assert(isSupportedRoot(Root, DAG) && "Unexpected root"); -    switch (Root->getOpcode()) { -    case ISD::ADD: -    case ISD::SUB: -    case ISD::MUL: { -      SDLoc DL(Root); -      MVT VT = Root->getSimpleValueType(0); -      return getDefaultScalableVLOps(VT, DL, DAG, Subtarget); -    } -    default: -      return std::make_pair(Root->getOperand(3), Root->getOperand(4)); -    } +  static std::pair<SDValue, SDValue> getMaskAndVL(const SDNode *Root) { +    assert(isSupportedRoot(Root) && "Unexpected root"); +    return std::make_pair(Root->getOperand(3), Root->getOperand(4));    }    /// Check if the Mask and VL of this operand are compatible with \p Root. -  bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG, -                              const RISCVSubtarget &Subtarget) const { -    auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); +  bool areVLAndMaskCompatible(const SDNode *Root) const { +    auto [Mask, VL] = getMaskAndVL(Root);      return isMaskCompatible(Mask) && isVLCompatible(VL);    } @@ -13195,14 +13160,11 @@ struct NodeExtensionHelper {    /// foldings that are supported by this class.    static bool isCommutative(const SDNode *N) {      switch (N->getOpcode()) { -    case ISD::ADD: -    case ISD::MUL:      case RISCVISD::ADD_VL:      case RISCVISD::MUL_VL:      case RISCVISD::VWADD_W_VL:      case RISCVISD::VWADDU_W_VL:        return true; -    case ISD::SUB:      case RISCVISD::SUB_VL:      case RISCVISD::VWSUB_W_VL:      case RISCVISD::VWSUBU_W_VL: @@ -13247,25 +13209,14 @@ struct CombineResult {    /// Return a value that uses TargetOpcode and that can be used to replace    /// Root.    /// The actual replacement is *not* done in that method. 
-  SDValue materialize(SelectionDAG &DAG, -                      const RISCVSubtarget &Subtarget) const { +  SDValue materialize(SelectionDAG &DAG) const {      SDValue Mask, VL, Merge; -    std::tie(Mask, VL) = -        NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget); -    switch (Root->getOpcode()) { -    default: -      Merge = Root->getOperand(2); -      break; -    case ISD::ADD: -    case ISD::SUB: -    case ISD::MUL: -      Merge = DAG.getUNDEF(Root->getValueType(0)); -      break; -    } +    std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root); +    Merge = Root->getOperand(2);      return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), -                       LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS), -                       RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS), -                       Merge, Mask, VL); +                       LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS), +                       RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge, +                       Mask, VL);    }  }; @@ -13282,16 +13233,15 @@ struct CombineResult {  static std::optional<CombineResult>  canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,                                   const NodeExtensionHelper &RHS, bool AllowSExt, -                                 bool AllowZExt, SelectionDAG &DAG, -                                 const RISCVSubtarget &Subtarget) { +                                 bool AllowZExt) {    assert((AllowSExt || AllowZExt) && "Forgot to set what you want?"); -  if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || -      !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) +  if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))      return std::nullopt;    if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt)      return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(                               Root->getOpcode(), /*IsSExt=*/false), -                         Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false); +                         Root, LHS, /*SExtLHS=*/false, RHS, +                         /*SExtRHS=*/false);    if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt)      return CombineResult(NodeExtensionHelper::getSameExtensionOpcode(                               Root->getOpcode(), /*IsSExt=*/true), @@ -13308,10 +13258,9 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS,  /// can be used to apply the pattern.  static std::optional<CombineResult>  canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, -                             const NodeExtensionHelper &RHS, SelectionDAG &DAG, -                             const RISCVSubtarget &Subtarget) { +                             const NodeExtensionHelper &RHS) {    return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, -                                          /*AllowZExt=*/true, DAG, Subtarget); +                                          /*AllowZExt=*/true);  }  /// Check if \p Root follows a pattern Root(LHS, ext(RHS)) @@ -13320,9 +13269,8 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS,  /// can be used to apply the pattern.  
static std::optional<CombineResult>  canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, -              const NodeExtensionHelper &RHS, SelectionDAG &DAG, -              const RISCVSubtarget &Subtarget) { -  if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) +              const NodeExtensionHelper &RHS) { +  if (!RHS.areVLAndMaskCompatible(Root))      return std::nullopt;    // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar @@ -13346,10 +13294,9 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS,  /// can be used to apply the pattern.  static std::optional<CombineResult>  canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, -                    const NodeExtensionHelper &RHS, SelectionDAG &DAG, -                    const RISCVSubtarget &Subtarget) { +                    const NodeExtensionHelper &RHS) {    return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, -                                          /*AllowZExt=*/false, DAG, Subtarget); +                                          /*AllowZExt=*/false);  }  /// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS)) @@ -13358,10 +13305,9 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS,  /// can be used to apply the pattern.  static std::optional<CombineResult>  canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, -                    const NodeExtensionHelper &RHS, SelectionDAG &DAG, -                    const RISCVSubtarget &Subtarget) { +                    const NodeExtensionHelper &RHS) {    return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false, -                                          /*AllowZExt=*/true, DAG, Subtarget); +                                          /*AllowZExt=*/true);  }  /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS)) @@ -13370,13 +13316,10 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS,  /// can be used to apply the pattern.  
static std::optional<CombineResult>  canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS, -               const NodeExtensionHelper &RHS, SelectionDAG &DAG, -               const RISCVSubtarget &Subtarget) { - +               const NodeExtensionHelper &RHS) {    if (!LHS.SupportsSExt || !RHS.SupportsZExt)      return std::nullopt; -  if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || -      !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) +  if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root))      return std::nullopt;    return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()),                         Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false); @@ -13386,8 +13329,6 @@ SmallVector<NodeExtensionHelper::CombineToTry>  NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {    SmallVector<CombineToTry> Strategies;    switch (Root->getOpcode()) { -  case ISD::ADD: -  case ISD::SUB:    case RISCVISD::ADD_VL:    case RISCVISD::SUB_VL:      // add|sub -> vwadd(u)|vwsub(u) @@ -13395,7 +13336,6 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {      // add|sub -> vwadd(u)_w|vwsub(u)_w      Strategies.push_back(canFoldToVW_W);      break; -  case ISD::MUL:    case RISCVISD::MUL_VL:      // mul -> vwmul(u)      Strategies.push_back(canFoldToVWWithSameExtension); @@ -13426,14 +13366,12 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) {  /// mul_vl -> vwmul(u) | vwmul_su  /// vwadd_w(u) -> vwadd(u)  /// vwub_w(u) -> vwadd(u) -static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, -                                           TargetLowering::DAGCombinerInfo &DCI, -                                           const RISCVSubtarget &Subtarget) { +static SDValue +combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {    SelectionDAG &DAG = DCI.DAG; -  if (!NodeExtensionHelper::isSupportedRoot(N, DAG)) -    return SDValue(); - +  assert(NodeExtensionHelper::isSupportedRoot(N) && +         "Shouldn't have called this method");    SmallVector<SDNode *> Worklist;    SmallSet<SDNode *, 8> Inserted;    Worklist.push_back(N); @@ -13442,11 +13380,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,    while (!Worklist.empty()) {      SDNode *Root = Worklist.pop_back_val(); -    if (!NodeExtensionHelper::isSupportedRoot(Root, DAG)) +    if (!NodeExtensionHelper::isSupportedRoot(Root))        return SDValue(); -    NodeExtensionHelper LHS(N, 0, DAG, Subtarget); -    NodeExtensionHelper RHS(N, 1, DAG, Subtarget); +    NodeExtensionHelper LHS(N, 0, DAG); +    NodeExtensionHelper RHS(N, 1, DAG);      auto AppendUsersIfNeeded = [&Worklist,                                  &Inserted](const NodeExtensionHelper &Op) {        if (Op.needToPromoteOtherUsers()) { @@ -13473,8 +13411,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,        for (NodeExtensionHelper::CombineToTry FoldingStrategy :             FoldingStrategies) { -        std::optional<CombineResult> Res = -            FoldingStrategy(N, LHS, RHS, DAG, Subtarget); +        std::optional<CombineResult> Res = FoldingStrategy(N, LHS, RHS);          if (Res) {            Matched = true;            CombinesToApply.push_back(*Res); @@ -13503,7 +13440,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N,    SmallVector<std::pair<SDValue, SDValue>> ValuesToReplace;    ValuesToReplace.reserve(CombinesToApply.size());    for (CombineResult Res : CombinesToApply) { -    SDValue NewValue = Res.materialize(DAG, Subtarget); +    
SDValue NewValue = Res.materialize(DAG);      if (!InputRootReplacement) {        assert(Res.Root == N &&               "First element is expected to be the current node"); @@ -14503,7 +14440,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,    if (SDValue V = useInversedSetcc(N, DAG, Subtarget))      return V; -  if (Subtarget.hasShortForwardBranchOpt()) +  if (Subtarget.hasConditionalMoveFusion())      return SDValue();    SDValue TrueVal = N->getOperand(1); @@ -14775,20 +14712,13 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,  static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,                                 const RISCVSubtarget &Subtarget) { - -  assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD); - -  if (N->getValueType(0).isFixedLengthVector()) -    return SDValue(); - +  assert(N->getOpcode() == RISCVISD::ADD_VL);    SDValue Addend = N->getOperand(0);    SDValue MulOp = N->getOperand(1); +  SDValue AddMergeOp = N->getOperand(2); -  if (N->getOpcode() == RISCVISD::ADD_VL) { -    SDValue AddMergeOp = N->getOperand(2); -    if (!AddMergeOp.isUndef()) -      return SDValue(); -  } +  if (!AddMergeOp.isUndef()) +    return SDValue();    auto IsVWMulOpc = [](unsigned Opc) {      switch (Opc) { @@ -14812,16 +14742,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG,    if (!MulMergeOp.isUndef())      return SDValue(); -  auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG, -                             const RISCVSubtarget &Subtarget) { -    if (N->getOpcode() == ISD::ADD) { -      SDLoc DL(N); -      return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG, -                                     Subtarget); -    } -    return std::make_pair(N->getOperand(3), N->getOperand(4)); -  }(N, DAG, Subtarget); - +  SDValue AddMask = N->getOperand(3); +  SDValue AddVL = N->getOperand(4);    SDValue MulMask = MulOp.getOperand(3);    SDValue MulVL = MulOp.getOperand(4); @@ -15087,18 +15009,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,      return DAG.getNode(ISD::AND, DL, VT, NewFMV,                         DAG.getConstant(~SignBit, DL, VT));    } -  case ISD::ADD: { -    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) -      return V; -    if (SDValue V = combineToVWMACC(N, DAG, Subtarget)) -      return V; +  case ISD::ADD:      return performADDCombine(N, DAG, Subtarget); -  } -  case ISD::SUB: { -    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) -      return V; +  case ISD::SUB:      return performSUBCombine(N, DAG, Subtarget); -  }    case ISD::AND:      return performANDCombine(N, DCI, Subtarget);    case ISD::OR: @@ -15106,8 +15020,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,    case ISD::XOR:      return performXORCombine(N, DAG, Subtarget);    case ISD::MUL: -    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) -      return V;      return performMULCombine(N, DAG);    case ISD::FADD:    case ISD::UMAX: @@ -15266,7 +15178,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,        return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),                           {LHS, RHS, CC, TrueV, FalseV}); -    if (!Subtarget.hasShortForwardBranchOpt()) { +    if (!Subtarget.hasConditionalMoveFusion()) {        // (select c, -1, y) -> -c | y        if (isAllOnesConstant(TrueV)) {          SDValue C = DAG.getSetCC(DL, VT, LHS, RHS, CCVal); @@ -15584,7 +15496,7 @@ SDValue 
RISCVTargetLowering::PerformDAGCombine(SDNode *N,      break;    }    case RISCVISD::ADD_VL: -    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) +    if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI))        return V;      return combineToVWMACC(N, DAG, Subtarget);    case RISCVISD::SUB_VL: @@ -15593,7 +15505,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,    case RISCVISD::VWSUB_W_VL:    case RISCVISD::VWSUBU_W_VL:    case RISCVISD::MUL_VL: -    return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); +    return combineBinOp_VLToVWBinOp_VL(N, DCI);    case RISCVISD::VFMADD_VL:    case RISCVISD::VFNMADD_VL:    case RISCVISD::VFMSUB_VL: @@ -18303,20 +18215,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,    // split it and then direct call can be matched by PseudoCALL.    if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {      const GlobalValue *GV = S->getGlobal(); - -    unsigned OpFlags = RISCVII::MO_CALL; -    if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) -      OpFlags = RISCVII::MO_PLT; - -    Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); +    Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, RISCVII::MO_CALL);    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { -    unsigned OpFlags = RISCVII::MO_CALL; - -    if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(), -                                                 nullptr)) -      OpFlags = RISCVII::MO_PLT; - -    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags); +    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, RISCVII::MO_CALL);    }    // The first call operand is the chain and the second is the target address. @@ -18694,6 +18595,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(UDIV_VL)    NODE_NAME_CASE(UREM_VL)    NODE_NAME_CASE(XOR_VL) +  NODE_NAME_CASE(AVGFLOORU_VL)    NODE_NAME_CASE(SADDSAT_VL)    NODE_NAME_CASE(UADDSAT_VL)    NODE_NAME_CASE(SSUBSAT_VL) @@ -18783,7 +18685,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {    NODE_NAME_CASE(VWMACCSU_VL)    NODE_NAME_CASE(VNSRL_VL)    NODE_NAME_CASE(SETCC_VL) -  NODE_NAME_CASE(VSELECT_VL)    NODE_NAME_CASE(VMERGE_VL)    NODE_NAME_CASE(VMAND_VL)    NODE_NAME_CASE(VMOR_VL) @@ -19357,7 +19258,6 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {  bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,                                                   SDValue &Offset,                                                   ISD::MemIndexedMode &AM, -                                                 bool &IsInc,                                                   SelectionDAG &DAG) const {    // Target does not support indexed loads.    if (!Subtarget.hasVendorXTHeadMemIdx()) @@ -19384,7 +19284,6 @@ bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,      if (!isLegalIndexedOffset)        return false; -    IsInc = (Op->getOpcode() == ISD::ADD);      Offset = Op->getOperand(1);      return true;    } @@ -19407,11 +19306,10 @@ bool RISCVTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,    } else      return false; -  bool IsInc; -  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) +  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, DAG))      return false; -  AM = IsInc ? 
ISD::PRE_INC : ISD::PRE_DEC; +  AM = ISD::PRE_INC;    return true;  } @@ -19431,15 +19329,14 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,    } else      return false; -  bool IsInc; -  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) +  if (!getIndexedAddressParts(Op, Base, Offset, AM, DAG))      return false;    // Post-indexing updates the base, so it's not a valid transform    // if that's not the same as the load's pointer.    if (Ptr != Base)      return false; -  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; +  AM = ISD::POST_INC;    return true;  } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h index 58ed611efc83..5d51fe168b04 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -253,6 +253,9 @@ enum NodeType : unsigned {    SSUBSAT_VL,    USUBSAT_VL, +  // Averaging adds of unsigned integers. +  AVGFLOORU_VL, +    MULHS_VL,    MULHU_VL,    FADD_VL, @@ -330,9 +333,8 @@ enum NodeType : unsigned {    // operand is VL.    SETCC_VL, -  // Vector select with an additional VL operand. This operation is unmasked. -  VSELECT_VL,    // General vmerge node with mask, true, false, passthru, and vl operands. +  // Tail agnostic vselect can be implemented by setting passthru to undef.    VMERGE_VL,    // Mask binary operators. @@ -526,7 +528,8 @@ public:    InstructionCost getVRGatherVVCost(MVT VT) const;    InstructionCost getVRGatherVICost(MVT VT) const; -  InstructionCost getVSlideCost(MVT VT) const; +  InstructionCost getVSlideVXCost(MVT VT) const; +  InstructionCost getVSlideVICost(MVT VT) const;    // Provide custom lowering hooks for some operations.    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -774,8 +777,7 @@ public:    bool isVScaleKnownToBeAPowerOfTwo() const override;    bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, -                              ISD::MemIndexedMode &AM, bool &IsInc, -                              SelectionDAG &DAG) const; +                              ISD::MemIndexedMode &AM, SelectionDAG &DAG) const;    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,                                   ISD::MemIndexedMode &AM,                                   SelectionDAG &DAG) const override; @@ -903,6 +905,7 @@ private:    SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op,                                              SelectionDAG &DAG) const;    SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; +  SDValue lowerUnsignedAvgFloor(SDValue Op, SelectionDAG &DAG) const;    SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const;    SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 3400b24e0abb..e591aa935c0b 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1381,6 +1381,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) {    if (!UnavailablePred || !AvailableInfo.isValid())      return; +  // If we don't know the exact VTYPE, we can't copy the vsetvli to the exit of +  // the unavailable pred. 
+  if (AvailableInfo.hasSEWLMULRatioOnly()) +    return; +    // Critical edge - TODO: consider splitting?    if (UnavailablePred->succ_size() != 1)      return; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index cd98438eed88..351f48c1708e 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1346,6 +1346,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) {    case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break;    case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break;    case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; + +  case RISCV::ANDN:  return RISCV::PseudoCCANDN;  break; +  case RISCV::ORN:   return RISCV::PseudoCCORN;   break; +  case RISCV::XNOR:  return RISCV::PseudoCCXNOR;  break;    }    return RISCV::INSTRUCTION_LIST_END; @@ -2365,7 +2369,6 @@ RISCVInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {    using namespace RISCVII;    static const std::pair<unsigned, const char *> TargetFlags[] = {        {MO_CALL, "riscv-call"}, -      {MO_PLT, "riscv-plt"},        {MO_LO, "riscv-lo"},        {MO_HI, "riscv-hi"},        {MO_PCREL_LO, "riscv-pcrel-lo"}, @@ -2651,6 +2654,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,    case RISCV::TH_MULSH:      // Operands 2 and 3 are commutable.      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); +  case RISCV::PseudoCCMOVGPRNoX0:    case RISCV::PseudoCCMOVGPR:      // Operands 4 and 5 are commutable.      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5); @@ -2807,6 +2811,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1,                                                     OpIdx2);    } +  case RISCV::PseudoCCMOVGPRNoX0:    case RISCV::PseudoCCMOVGPR: {      // CCMOV can be commuted by inverting the condition.      auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm()); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 35e8edf5d2fa..792e0bbdf581 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -729,22 +729,6 @@ def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", "">,    let imm12 = 0b110000000000;  } -let Predicates = [HasStdExtZawrs] in { -def WRS_NTO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.nto", "">, -              Sched<[]> { -  let rs1 = 0; -  let rd = 0; -  let imm12 = 0b000000001101; -} - -def WRS_STO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.sto", "">, -              Sched<[]> { -  let rs1 = 0; -  let rd = 0; -  let imm12 = 0b000000011101; -} -} // Predicates = [HasStdExtZawrs] -  } // hasSideEffects = 1, mayLoad = 0, mayStore = 0  def CSRRW : CSR_ir<0b001, "csrrw">; @@ -1387,6 +1371,24 @@ def PseudoCCMOVGPR : Pseudo<(outs GPR:$dst),                              ReadSFBALU, ReadSFBALU]>;  } +// This should always expand to a branch+c.mv so the size is 6 or 4 if the +// branch is compressible. +let Predicates = [HasConditionalMoveFusion, NoShortForwardBranchOpt], +    Constraints = "$dst = $falsev", isCommutable = 1, Size = 6 in { +// This instruction moves $truev to $dst when the condition is true. It will +// be expanded to control flow in RISCVExpandPseudoInsts. 
+// We use GPRNoX0 because c.mv cannot encode X0. +def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst), +                                (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, +                                 GPRNoX0:$falsev, GPRNoX0:$truev), +                                [(set GPRNoX0:$dst, +                                  (riscv_selectcc_frag:$cc (XLenVT GPR:$lhs), +                                                           (XLenVT GPR:$rhs), +                                                           cond, (XLenVT GPRNoX0:$truev), +                                                           (XLenVT GPRNoX0:$falsev)))]>, +                         Sched<[]>; +} +  // Conditional binops, that updates update $dst to (op rs1, rs2) when condition  // is true. Returns $falsev otherwise. Selected by optimizeSelect.  // TODO: Can we use DefaultOperands on the regular binop to accomplish this more @@ -1517,6 +1519,23 @@ def PseudoCCSRAIW : Pseudo<(outs GPR:$dst),                              GPR:$falsev, GPR:$rs1, simm12:$rs2), []>,                      Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU,                             ReadSFBALU]>; + +// Zbb/Zbkb instructions +def PseudoCCANDN : Pseudo<(outs GPR:$dst), +                          (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, +                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, +                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, +                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +def PseudoCCORN : Pseudo<(outs GPR:$dst), +                         (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, +                          GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, +                  Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, +                         ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +def PseudoCCXNOR : Pseudo<(outs GPR:$dst), +                          (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, +                           GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, +                   Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, +                          ReadSFBALU, ReadSFBALU, ReadSFBALU]>;  }  multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> { @@ -1535,7 +1554,7 @@ multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> {               (IntCCtoRISCVCC $cc), valty:$truev, valty:$falsev)>;  } -let Predicates = [NoShortForwardBranchOpt] in +let Predicates = [NoConditionalMoveFusion] in  defm Select_GPR : SelectCC_GPR_rrirr<GPR, XLenVT>;  class SelectCompressOpt<CondCode Cond> @@ -2095,6 +2114,7 @@ include "RISCVInstrInfoM.td"  // Atomic  include "RISCVInstrInfoA.td" +include "RISCVInstrInfoZa.td"  // Scalar FP  include "RISCVInstrInfoF.td" diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index c8301fcc6b93..4d0567e41abc 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -7,8 +7,7 @@  //===----------------------------------------------------------------------===//  //  // This file describes the RISC-V instructions from the standard 'A', Atomic -// Instructions extension as well as the experimental 'Zacas' (Atomic -// Compare-and-Swap) extension. +// Instructions extension.  
//  //===----------------------------------------------------------------------===// @@ -96,15 +95,6 @@ defm AMOMAXU_D  : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">,                    Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;  } // Predicates = [HasStdExtA, IsRV64] -let Predicates = [HasStdExtZacas] in { -defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">; -defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">; -} // Predicates = [HasStdExtZacas] - -let Predicates = [HasStdExtZacas, IsRV64] in { -defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">; -} // Predicates = [HasStdExtZacas, IsRV64] -  //===----------------------------------------------------------------------===//  // Pseudo-instructions and codegen patterns  //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 6af710049a9d..418421b2a556 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -36,11 +36,13 @@ def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmINX">;  def GPRPF64AsFPR : AsmOperandClass {    let Name = "GPRPF64AsFPR";    let ParserMethod = "parseGPRAsFPR"; +  let PredicateMethod = "isGPRAsFPR";    let RenderMethod = "addRegOperands";  }  def GPRF64AsFPR : AsmOperandClass {    let Name = "GPRF64AsFPR"; +  let PredicateMethod = "isGPRAsFPR";    let ParserMethod = "parseGPRAsFPR";    let RenderMethod = "addRegOperands";  } diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 30deeaa06448..fcb18b67623e 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6719,12 +6719,14 @@ defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">;  // 15.2. Vector mask population count vcpop  //===----------------------------------------------------------------------===// +let IsSignExtendingOpW = 1 in  defm PseudoVCPOP: VPseudoVPOP_M;  //===----------------------------------------------------------------------===//  // 15.3. vfirst find-first-set mask bit  //===----------------------------------------------------------------------===// +let IsSignExtendingOpW = 1 in  defm PseudoVFIRST: VPseudoV1ST_M;  //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index b7c845703794..4f87c36506e5 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1131,6 +1131,22 @@ defm : VPatBinarySDNode_VV_VX_VI<uaddsat, "PseudoVSADDU">;  defm : VPatBinarySDNode_VV_VX<ssubsat, "PseudoVSSUB">;  defm : VPatBinarySDNode_VV_VX<usubsat, "PseudoVSSUBU">; +// 12.2. 
Vector Single-Width Averaging Add and Subtract +foreach vti = AllIntegerVectors in { +  let Predicates = GetVTypePredicates<vti>.Predicates in { +    def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1), +                         (vti.Vector vti.RegClass:$rs2)), +              (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX) +                (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, +                0b10, vti.AVL, vti.Log2SEW, TA_MA)>; +    def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1), +                         (vti.Vector (SplatPat (XLenVT GPR:$rs2)))), +              (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX) +                (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2, +                0b10, vti.AVL, vti.Log2SEW, TA_MA)>; +  } +} +  // 15. Vector Mask Instructions  // 15.1. Vector Mask-Register Logical Instructions diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 5b50a4a78c01..d60ff4b5fab0 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -111,6 +111,7 @@ def riscv_ctlz_vl       : SDNode<"RISCVISD::CTLZ_VL",       SDT_RISCVIntUnOp_VL>  def riscv_cttz_vl       : SDNode<"RISCVISD::CTTZ_VL",       SDT_RISCVIntUnOp_VL>;  def riscv_ctpop_vl      : SDNode<"RISCVISD::CTPOP_VL",      SDT_RISCVIntUnOp_VL>; +def riscv_avgflooru_vl  : SDNode<"RISCVISD::AVGFLOORU_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;  def riscv_saddsat_vl   : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;  def riscv_uaddsat_vl   : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>;  def riscv_ssubsat_vl   : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>; @@ -338,13 +339,6 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL",                                                             SDTCisSameNumEltsAs<0, 4>,                                                             SDTCisVT<5, XLenVT>]>>; -def SDT_RISCVSelect_VL  : SDTypeProfile<1, 4, [ -  SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, -  SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT> -]>; - -def riscv_vselect_vl  : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; -  def SDT_RISCVVMERGE_VL  : SDTypeProfile<1, 5, [    SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>,    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>, @@ -1722,21 +1716,21 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> {                (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")                     vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; -    def : Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                  (vti.Vector (op vti.RegClass:$rd,                                  (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2,                                      srcvalue, (vti.Mask true_mask), VLOpFrag),                                  srcvalue, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")                
     vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; -    def : Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                  (vti.Vector (op vti.RegClass:$rd,                                  (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2,                                      srcvalue, (vti.Mask true_mask), VLOpFrag),                                  srcvalue, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK")                     vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1861,17 +1855,17 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> {                (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")                     vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; -    def : Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                             (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,                              vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")                     vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; -    def : Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                             (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,                              vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")                     vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1905,10 +1899,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {                     // RISCVInsertReadWriteCSR                     FRM_DYN,                     GPR:$vl, vti.Log2SEW, TU_MU)>; -    def : Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                             (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2,                              vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK")                     vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), @@ -1916,10 +1910,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> {                     // RISCVInsertReadWriteCSR                     FRM_DYN,                     GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; -    def : 
Pat<(riscv_vselect_vl (vti.Mask V0), +    def : Pat<(riscv_vmerge_vl (vti.Mask V0),                             (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2,                              vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), -                            vti.RegClass:$rd, VLOpFrag), +                            vti.RegClass:$rd, undef, VLOpFrag),                (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK")                     vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2,                     (vti.Mask V0), @@ -2255,31 +2249,6 @@ foreach vtiTowti = AllWidenableIntVectors in {  // 11.15. Vector Integer Merge Instructions  foreach vti = AllIntegerVectors in {    let Predicates = GetVTypePredicates<vti>.Predicates in { -    def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), -                                            vti.RegClass:$rs1, -                                            vti.RegClass:$rs2, -                                            VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX) -                   (vti.Vector (IMPLICIT_DEF)), -                   vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), -                   GPR:$vl, vti.Log2SEW)>; - -    def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), -                                            (SplatPat XLenVT:$rs1), -                                            vti.RegClass:$rs2, -                                            VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX) -                   (vti.Vector (IMPLICIT_DEF)), -                   vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - -    def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), -                                            (SplatPat_simm5 simm5:$rs1), -                                            vti.RegClass:$rs2, -                                            VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX) -                   (vti.Vector (IMPLICIT_DEF)), -                   vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; -      def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0),                                             vti.RegClass:$rs1,                                             vti.RegClass:$rs2, @@ -2338,6 +2307,24 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">;  defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">;  defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">; +// 12.2. 
Vector Single-Width Averaging Add and Subtract +foreach vti = AllIntegerVectors in { +  let Predicates = GetVTypePredicates<vti>.Predicates in { +    def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1), +                                  (vti.Vector vti.RegClass:$rs2), +                                  vti.RegClass:$merge, (vti.Mask V0), VLOpFrag), +              (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX#"_MASK") +                vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2, +                (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; +    def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1), +                                  (vti.Vector (SplatPat (XLenVT GPR:$rs2))), +                                  vti.RegClass:$merge, (vti.Mask V0), VLOpFrag), +              (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX#"_MASK") +                vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2, +                (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; +  } +} +  // 12.5. Vector Narrowing Fixed-Point Clip Instructions  class VPatTruncSatClipMaxMinBase<string inst,                                   VTypeInfo vti, @@ -2534,33 +2521,6 @@ foreach fvti = AllFloatVectors in {    // 13.15. Vector Floating-Point Merge Instruction    defvar ivti = GetIntVTypeInfo<fvti>.Vti;    let Predicates = GetVTypePredicates<ivti>.Predicates in { -    def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), -                                             fvti.RegClass:$rs1, -                                             fvti.RegClass:$rs2, -                                             VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) -                   (fvti.Vector (IMPLICIT_DEF)), -                   fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), -                   GPR:$vl, fvti.Log2SEW)>; - -    def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), -                                             (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), -                                             fvti.RegClass:$rs2, -                                             VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) -                   (fvti.Vector (IMPLICIT_DEF)), -                   fvti.RegClass:$rs2, -                   GPR:$imm, -                   (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - -    def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), -                                             (SplatFPOp (fvti.Scalar fpimm0)), -                                             fvti.RegClass:$rs2, -                                             VLOpFrag)), -              (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) -                   (fvti.Vector (IMPLICIT_DEF)), -                   fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; -    def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),                                            fvti.RegClass:$rs1,                                            fvti.RegClass:$rs2, @@ -2571,6 +2531,16 @@ foreach fvti = AllFloatVectors in {                   GPR:$vl, fvti.Log2SEW)>;    def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), +                                          (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), +                                          fvti.RegClass:$rs2, +                                          fvti.RegClass:$merge, +                                          VLOpFrag)), +            
(!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) +                 fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), +                 GPR:$vl, fvti.Log2SEW)>; + + +  def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),                                            (SplatFPOp (fvti.Scalar fpimm0)),                                            fvti.RegClass:$rs2,                                            fvti.RegClass:$merge, @@ -2581,16 +2551,6 @@ foreach fvti = AllFloatVectors in {    }    let Predicates = GetVTypePredicates<fvti>.Predicates in { -    def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), -                                             (SplatFPOp fvti.ScalarRegClass:$rs1), -                                             fvti.RegClass:$rs2, -                                             VLOpFrag)), -              (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) -                   (fvti.Vector (IMPLICIT_DEF)), -                   fvti.RegClass:$rs2, -                   (fvti.Scalar fvti.ScalarRegClass:$rs1), -                   (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; -      def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0),                                              (SplatFPOp fvti.ScalarRegClass:$rs1),                                              fvti.RegClass:$rs2, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td new file mode 100644 index 000000000000..a09f5715b24f --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -0,0 +1,44 @@ +//===-- RISCVInstrInfoZa.td - RISC-V Atomic instructions ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard atomic 'Za*' +// extensions: +//   - Zawrs (v1.0) : Wait-on-Reservation-Set. +//   - Zacas (v1.0-rc1) : Atomic Compare-and-Swap. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Zacas (Atomic Compare-and-Swap) +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtZacas] in { +defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">; +defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">; +} // Predicates = [HasStdExtZacas] + +let Predicates = [HasStdExtZacas, IsRV64] in { +defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">; +} // Predicates = [HasStdExtZacas, IsRV64] + +//===----------------------------------------------------------------------===// +// Zawrs (Wait-on-Reservation-Set) +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class WRSInst<bits<12> funct12, string opcodestr> +    : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), opcodestr, ""> { +  let rs1 = 0; +  let rd = 0; +  let imm12 = funct12; +} + +let Predicates = [HasStdExtZawrs] in { +def WRS_NTO : WRSInst<0b000000001101, "wrs.nto">, Sched<[]>; +def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>; +} // Predicates = [HasStdExtZawrs] diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 2c2b34bb5b77..c16eee67f3c5 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -126,7 +126,11 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI,      if (MI->getNumExplicitDefs() != 1)        return false; -    for (auto &UserOp : MRI.use_nodbg_operands(MI->getOperand(0).getReg())) { +    Register DestReg = MI->getOperand(0).getReg(); +    if (!DestReg.isVirtual()) +      return false; + +    for (auto &UserOp : MRI.use_nodbg_operands(DestReg)) {        const MachineInstr *UserMI = UserOp.getParent();        unsigned OpIdx = UserOp.getOperandNo(); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVProcessors.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVProcessors.td index ba8996e710ed..52800f086129 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -232,7 +232,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", NoSchedModel,                                         FeatureStdExtZba,                                         FeatureStdExtZbb,                                         FeatureStdExtZbs, -                                       FeatureStdExtZfhmin]>; +                                       FeatureStdExtZfhmin], +                                      [TuneConditionalCompressedMoveFusion]>;  def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",                                                SyntacoreSCR1Model, diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 840fd149d681..a59d058382fe 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -487,7 +487,7 @@ defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t,  defvar VM1VTs = [vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t,                   vbfloat16m1_t, vfloat16m1_t, vfloat32m1_t,                   
vfloat64m1_t, vint8mf2_t, vint8mf4_t, vint8mf8_t, -	               vint16mf2_t, vint16mf4_t, vint32mf2_t, +                 vint16mf2_t, vint16mf4_t, vint32mf2_t,                   vfloat16mf4_t, vfloat16mf2_t, vbfloat16mf4_t,                   vbfloat16mf2_t, vfloat32mf2_t]; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h index 26320b05d9be..2ba93764facd 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -150,6 +150,13 @@ public:    bool hasHalfFPLoadStoreMove() const {      return HasStdExtZfhmin || HasStdExtZfbfmin;    } + +  bool hasConditionalMoveFusion() const { +    // Do we support fusing a branch+mv or branch+c.mv as a conditional move. +    return (hasConditionalCompressedMoveFusion() && hasStdExtCOrZca()) || +           hasShortForwardBranchOpt(); +  } +    bool is64Bit() const { return IsRV64; }    MVT getXLenVT() const {      return is64Bit() ? MVT::i64 : MVT::i32; diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 4614446b2150..b3916c987005 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -34,6 +34,65 @@ static cl::opt<unsigned> SLPMaxVF(          "exclusively by SLP vectorizer."),      cl::Hidden); +InstructionCost +RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, +                                      TTI::TargetCostKind CostKind) { +  size_t NumInstr = OpCodes.size(); +  if (CostKind == TTI::TCK_CodeSize) +    return NumInstr; +  InstructionCost LMULCost = TLI->getLMULCost(VT); +  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) +    return LMULCost * NumInstr; +  InstructionCost Cost = 0; +  for (auto Op : OpCodes) { +    switch (Op) { +    case RISCV::VRGATHER_VI: +      Cost += TLI->getVRGatherVICost(VT); +      break; +    case RISCV::VRGATHER_VV: +      Cost += TLI->getVRGatherVVCost(VT); +      break; +    case RISCV::VSLIDEUP_VI: +    case RISCV::VSLIDEDOWN_VI: +      Cost += TLI->getVSlideVICost(VT); +      break; +    case RISCV::VSLIDEUP_VX: +    case RISCV::VSLIDEDOWN_VX: +      Cost += TLI->getVSlideVXCost(VT); +      break; +    case RISCV::VREDMAX_VS: +    case RISCV::VREDMIN_VS: +    case RISCV::VREDMAXU_VS: +    case RISCV::VREDMINU_VS: +    case RISCV::VREDSUM_VS: +    case RISCV::VREDAND_VS: +    case RISCV::VREDOR_VS: +    case RISCV::VREDXOR_VS: +    case RISCV::VFREDMAX_VS: +    case RISCV::VFREDMIN_VS: +    case RISCV::VFREDUSUM_VS: { +      unsigned VL = VT.getVectorMinNumElements(); +      if (!VT.isFixedLengthVector()) +        VL *= *getVScaleForTuning(); +      Cost += Log2_32_Ceil(VL); +      break; +    } +    case RISCV::VFREDOSUM_VS: { +      unsigned VL = VT.getVectorMinNumElements(); +      if (!VT.isFixedLengthVector()) +        VL *= *getVScaleForTuning(); +      Cost += VL; +      break; +    } +    case RISCV::VMV_S_X: +      // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1 +    default: +      Cost += LMULCost; +    } +  } +  return Cost; +} +  InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,                                              TTI::TargetCostKind CostKind) {    assert(Ty->isIntegerTy() && @@ -281,7 +340,8 @@ InstructionCost 
RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,              // Example sequence:              //   vnsrl.wi   v10, v8, 0              if (equal(DeinterleaveMask, Mask)) -              return LT.first * TLI->getLMULCost(LT.second); +              return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, +                                                        LT.second, CostKind);            }          }        } @@ -292,7 +352,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,             LT.second.getVectorNumElements() <= 256)) {          VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext());          InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); -        return IndexCost + TLI->getVRGatherVVCost(LT.second); +        return IndexCost + +               getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);        }        [[fallthrough]];      } @@ -310,7 +371,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,          VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC);          InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);          InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); -        return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost; +        return 2 * IndexCost + +               getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, +                                       LT.second, CostKind) + +               MaskCost;        }        [[fallthrough]];      } @@ -365,19 +429,24 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,      // Example sequence:      // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)      // vslidedown.vi  v8, v9, 2 -    return LT.first * TLI->getVSlideCost(LT.second); +    return LT.first * +           getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);    case TTI::SK_InsertSubvector:      // Example sequence:      // vsetivli     zero, 4, e8, mf2, tu, ma (ignored)      // vslideup.vi  v8, v9, 2 -    return LT.first * TLI->getVSlideCost(LT.second); +    return LT.first * +           getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);    case TTI::SK_Select: {      // Example sequence:      // li           a0, 90      // vsetivli     zero, 8, e8, mf2, ta, ma (ignored)      // vmv.s.x      v0, a0      // vmerge.vvm   v8, v9, v8, v0 -    return LT.first * 3 * TLI->getLMULCost(LT.second); +    return LT.first * +           (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li +            getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, +                                    LT.second, CostKind));    }    case TTI::SK_Broadcast: {      bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == @@ -389,7 +458,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,          //   vsetivli zero, 2, e8, mf8, ta, ma (ignored)          //   vmv.v.x v8, a0          //   vmsne.vi v0, v8, 0 -        return LT.first * TLI->getLMULCost(LT.second) * 3; +        return LT.first * +               (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi +                getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, +                                        LT.second, CostKind));        }        // Example sequence:        //   vsetivli  zero, 2, e8, mf8, ta, mu (ignored) @@ -400,24 +472,38 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,       
 //   vmv.v.x v8, a0        //   vmsne.vi  v0, v8, 0 -      return LT.first * TLI->getLMULCost(LT.second) * 6; +      return LT.first * +             (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi +              TLI->getLMULCost( +                  LT.second) + // FIXME: vmv.x.s is the same as extractelement +              getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, +                                       RISCV::VMV_V_X, RISCV::VMSNE_VI}, +                                      LT.second, CostKind));      }      if (HasScalar) {        // Example sequence:        //   vmv.v.x v8, a0 -      return LT.first * TLI->getLMULCost(LT.second); +      return LT.first * +             getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);      }      // Example sequence:      //   vrgather.vi     v9, v8, 0 -    return LT.first * TLI->getVRGatherVICost(LT.second); +    return LT.first * +           getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);    } -  case TTI::SK_Splice: +  case TTI::SK_Splice: {      // vslidedown+vslideup.      // TODO: Multiplying by LT.first implies this legalizes into multiple copies      // of similar code, but I think we expand through memory. -    return 2 * LT.first * TLI->getVSlideCost(LT.second); +    unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; +    if (Index >= 0 && Index < 32) +      Opcodes[0] = RISCV::VSLIDEDOWN_VI; +    else if (Index < 0 && Index > -32) +      Opcodes[1] = RISCV::VSLIDEUP_VI; +    return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); +  }    case TTI::SK_Reverse: {      // TODO: Cases to improve here:      // * Illegal vector types @@ -437,7 +523,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,      if (LT.second.isFixedLengthVector())        // vrsub.vi has a 5 bit immediate field, otherwise an li suffices        LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; -    InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second); +    // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} +    InstructionCost GatherCost = +        2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);      // Mask operation additionally required extend and truncate      InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0;      return LT.first * (LenCost + GatherCost + ExtendCost); diff --git a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 4c955744b37d..7e5dbddb5b51 100644 --- a/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -48,6 +48,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {    /// actual target hardware.    unsigned getEstimatedVLFor(VectorType *Ty); +  InstructionCost getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, +                                          TTI::TargetCostKind CostKind); +    /// Return the cost of accessing a constant pool entry of the specified    /// type.    
InstructionCost getConstantPoolLoadCost(Type *Ty, diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 629db8e2eb4d..0a8b5499a1fc 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -211,8 +211,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,    MDString *MDKernelArgType =        getKernelArgAttribute(F, ArgIdx, "kernel_arg_type"); -  if (!MDKernelArgType || (MDKernelArgType->getString().ends_with("*") && -                           MDKernelArgType->getString().ends_with("_t"))) +  if (!MDKernelArgType || (!MDKernelArgType->getString().ends_with("*") && +                           !MDKernelArgType->getString().ends_with("_t")))      return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual);    if (MDKernelArgType->getString().ends_with("*")) @@ -438,7 +438,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,        assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs");        ArgVRegs.push_back(Arg.Regs[0]);        SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder); -      GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF()); +      if (!GR->getSPIRVTypeForVReg(Arg.Regs[0])) +        GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF());      }      if (auto Res = SPIRV::lowerBuiltin(              DemangledName, SPIRV::InstructionSet::OpenCL_std, MIRBuilder, diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 660c574daf38..fb4e9932dd2d 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -74,6 +74,7 @@ class SPIRVEmitIntrinsics    void processInstrAfterVisit(Instruction *I);    void insertAssignPtrTypeIntrs(Instruction *I);    void insertAssignTypeIntrs(Instruction *I); +  void insertPtrCastInstr(Instruction *I);    void processGlobalValue(GlobalVariable &GV);  public: @@ -255,7 +256,19 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) {  }  Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { -  SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()}; +  Value *Source = I.getOperand(0); + +  // SPIR-V, contrary to LLVM 17+ IR, supports bitcasts between pointers of +  // varying element types. In the case of IR coming from older versions of LLVM, +  // such bitcasts do not provide sufficient information; they should simply be +  // skipped here and handled in insertPtrCastInstr. +  if (I.getType()->isPointerTy()) { +    I.replaceAllUsesWith(Source); +    I.eraseFromParent(); +    return nullptr; +  } + +  SmallVector<Type *, 2> Types = {I.getType(), Source->getType()};    SmallVector<Value *> Args(I.op_begin(), I.op_end());    auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args});    std::string InstName = I.hasName() ? 
I.getName().str() : ""; @@ -265,6 +278,111 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {    return NewI;  } +void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) { +  Value *Pointer; +  Type *ExpectedElementType; +  unsigned OperandToReplace; +  if (StoreInst *SI = dyn_cast<StoreInst>(I)) { +    Pointer = SI->getPointerOperand(); +    ExpectedElementType = SI->getValueOperand()->getType(); +    OperandToReplace = 1; +  } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { +    Pointer = LI->getPointerOperand(); +    ExpectedElementType = LI->getType(); +    OperandToReplace = 0; +  } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { +    Pointer = GEPI->getPointerOperand(); +    ExpectedElementType = GEPI->getSourceElementType(); +    OperandToReplace = 0; +  } else { +    return; +  } + +  // If Pointer is the result of nop BitCastInst (ptr -> ptr), use the source +  // pointer instead. The BitCastInst should be later removed when visited. +  while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer)) +    Pointer = BC->getOperand(0); + +  // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type. +  GlobalValue *GV = dyn_cast<GlobalValue>(Pointer); +  if (GV && GV->getValueType() == ExpectedElementType) +    return; + +  // Do not emit spv_ptrcast if Pointer is a result of alloca with expected +  // type. +  AllocaInst *A = dyn_cast<AllocaInst>(Pointer); +  if (A && A->getAllocatedType() == ExpectedElementType) +    return; + +  if (dyn_cast<GetElementPtrInst>(Pointer)) +    return; + +  setInsertPointSkippingPhis(*IRB, I); +  Constant *ExpectedElementTypeConst = +      Constant::getNullValue(ExpectedElementType); +  ConstantAsMetadata *CM = +      ValueAsMetadata::getConstant(ExpectedElementTypeConst); +  MDTuple *TyMD = MDNode::get(F->getContext(), CM); +  MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD); +  unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace(); +  bool FirstPtrCastOrAssignPtrType = true; + +  // Do not emit new spv_ptrcast if equivalent one already exists or when +  // spv_assign_ptr_type already targets this pointer with the same element +  // type. +  for (auto User : Pointer->users()) { +    auto *II = dyn_cast<IntrinsicInst>(User); +    if (!II || +        (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type && +         II->getIntrinsicID() != Intrinsic::spv_ptrcast) || +        II->getOperand(0) != Pointer) +      continue; + +    // There is some spv_ptrcast/spv_assign_ptr_type already targeting this +    // pointer. +    FirstPtrCastOrAssignPtrType = false; +    if (II->getOperand(1) != VMD || +        dyn_cast<ConstantInt>(II->getOperand(2))->getSExtValue() != +            AddressSpace) +      continue; + +    // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the same +    // element type and address space. +    if (II->getIntrinsicID() != Intrinsic::spv_ptrcast) +      return; + +    // This must be a spv_ptrcast, do not emit new if this one has the same BB +    // as I. Otherwise, search for other spv_ptrcast/spv_assign_ptr_type. +    if (II->getParent() != I->getParent()) +      continue; + +    I->setOperand(OperandToReplace, II); +    return; +  } + +  // Do not emit spv_ptrcast if it would cast to the default pointer element +  // type (i8) of the same address space. 
+  if (ExpectedElementType->isIntegerTy(8)) +    return; + +  // If this would be the first spv_ptrcast and there is no spv_assign_ptr_type +  // for this pointer before, do not emit spv_ptrcast but emit +  // spv_assign_ptr_type instead. +  if (FirstPtrCastOrAssignPtrType && isa<Instruction>(Pointer)) { +    buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()}, +                    ExpectedElementTypeConst, Pointer, +                    {IRB->getInt32(AddressSpace)}); +    return; +  } else { +    SmallVector<Type *, 2> Types = {Pointer->getType(), Pointer->getType()}; +    SmallVector<Value *, 2> Args = {Pointer, VMD, IRB->getInt32(AddressSpace)}; +    auto *PtrCastI = +        IRB->CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args); +    I->setOperand(OperandToReplace, PtrCastI); +    return; +  } +} +  Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) {    SmallVector<Type *, 4> Types = {I.getType(), I.getOperand(0)->getType(),                                    I.getOperand(1)->getType(), @@ -522,13 +640,18 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {    for (auto &I : Worklist) {      insertAssignPtrTypeIntrs(I);      insertAssignTypeIntrs(I); +    insertPtrCastInstr(I);    }    for (auto *I : Worklist) {      TrackConstants = true;      if (!I->getType()->isVoidTy() || isa<StoreInst>(I))        IRB->SetInsertPoint(I->getNextNode()); +    // Visitors return either the original/newly created instruction for further +    // processing, nullptr otherwise.      I = visit(*I); +    if (!I) +      continue;      processInstrAfterVisit(I);    }    return true; diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index faaf7f0e2548..061bc9674237 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -289,8 +289,9 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpirvType,    return ConvReg;  } -bool SPIRVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, -                                        MachineInstr &MI) const { +bool SPIRVLegalizerInfo::legalizeCustom( +    LegalizerHelper &Helper, MachineInstr &MI, +    LostDebugLocObserver &LocObserver) const {    auto Opc = MI.getOpcode();    MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();    if (!isTypeFoldingSupported(Opc)) { diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 2541ff29edb0..f18b15b7f169 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -29,7 +29,8 @@ class SPIRVLegalizerInfo : public LegalizerInfo {    SPIRVGlobalRegistry *GR;  public: -  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; +  bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, +                      LostDebugLocObserver &LocObserver) const override;    SPIRVLegalizerInfo(const SPIRVSubtarget &ST);  };  } // namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 1bfce70fedc0..cbc16fa98661 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ 
-125,12 +125,32 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR,    SmallVector<MachineInstr *, 10> ToErase;    for (MachineBasicBlock &MBB : MF) {      for (MachineInstr &MI : MBB) { -      if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) +      if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && +          !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast))          continue;        assert(MI.getOperand(2).isReg());        MIB.setInsertPt(*MI.getParent(), MI); -      MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg());        ToErase.push_back(&MI); +      if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { +        MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); +        continue; +      } +      Register Def = MI.getOperand(0).getReg(); +      Register Source = MI.getOperand(2).getReg(); +      SPIRVType *BaseTy = GR->getOrCreateSPIRVType( +          getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); +      SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( +          BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), +          addressSpaceToStorageClass(MI.getOperand(4).getImm())); + +      // If the bitcast would be redundant, replace all uses with the source +      // register. +      if (GR->getSPIRVTypeForVReg(Source) == AssignedPtrType) { +        MIB.getMRI()->replaceRegWith(Def, Source); +      } else { +        GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF); +        MIB.buildBitcast(Def, Source); +      }      }    }    for (MachineInstr *MI : ToErase) @@ -587,6 +607,40 @@ static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR,    }  } +static bool isImplicitFallthrough(MachineBasicBlock &MBB) { +  if (MBB.empty()) +    return true; + +  // Branching SPIR-V intrinsics are not detected by this generic method. +  // Thus, we can only trust a negative result. +  if (!MBB.canFallThrough()) +    return false; + +  // Otherwise, we must manually check if we have a SPIR-V intrinsic which +  // prevents an implicit fallthrough. +  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); +       It != E; ++It) { +    if (isSpvIntrinsic(*It, Intrinsic::spv_switch)) +      return false; +  } +  return true; +} + +static void removeImplicitFallthroughs(MachineFunction &MF, +                                       MachineIRBuilder MIB) { +  // It is valid for MachineBasicBlocks to not finish with a branch instruction. +  // In such cases, they will simply fall through to their immediate successor. +  for (MachineBasicBlock &MBB : MF) { +    if (!isImplicitFallthrough(MBB)) +      continue; + +    assert(std::distance(MBB.successors().begin(), MBB.successors().end()) == +           1); +    MIB.setInsertPt(MBB, MBB.end()); +    MIB.buildBr(**MBB.successors().begin()); +  } +} +  bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {    // Initialize the type registry.    
const SPIRVSubtarget &ST = MF.getSubtarget<SPIRVSubtarget>(); @@ -599,6 +653,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {    generateAssignInstrs(MF, GR, MIB);    processSwitches(MF, GR, MIB);    processInstrsWithTypeFolding(MF, GR, MIB); +  removeImplicitFallthroughs(MF, MIB);    return true;  } diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 1503f263e42c..62d9090d289f 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -29,6 +29,7 @@  #include "llvm/MC/TargetRegistry.h"  #include "llvm/Pass.h"  #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils.h"  #include <optional>  using namespace llvm; @@ -151,6 +152,19 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) {  }  void SPIRVPassConfig::addIRPasses() { +  if (TM.getSubtargetImpl()->isVulkanEnv()) { +    // Once legalized, we need to structurize the CFG to follow the spec. +    // This is done through the following 8 steps. +    // TODO(#75801): add the remaining steps. + +    // 1.  Simplify loops for subsequent transformations. After this step, loops +    // have the following properties: +    //  - loops have a single entry edge (pre-header to loop header). +    //  - all loop exits are dominated by the loop pre-header. +    //  - loops have a single back-edge. +    addPass(createLoopSimplifyPass()); +  } +    TargetPassConfig::addIRPasses();  addPass(createSPIRVRegularizerPass());  addPass(createSPIRVPrepareFunctionsPass(TM)); diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 1c0e8d84e2fd..d4f7d8e89af5 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -228,8 +228,8 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) {    return MI->getOperand(1).getCImm()->getValue().getZExtValue();  } -bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) { -  if (auto *GI = dyn_cast<GIntrinsic>(&MI)) +bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID) { +  if (const auto *GI = dyn_cast<GIntrinsic>(&MI))      return GI->is(IntrinsicID);    return false;  } diff --git a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.h b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.h index 30fae6c7de47..60742e2f2728 100644 --- a/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/contrib/llvm-project/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -79,7 +79,7 @@ MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,  uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI);  // Check if MI is a SPIR-V specific intrinsic call. -bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID); +bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID);  // Get type of i-th operand of the metadata node.  
Type *getMDOperandAsType(const MDNode *N, unsigned I); diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index c7d8591c5bdf..320f91c76057 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1641,7 +1641,7 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {      // If this is a 64-bit constant that is out of the range of LLILF,      // LLIHF and LGFI, split it into two 32-bit pieces.      if (Node->getValueType(0) == MVT::i64) { -      uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue(); +      uint64_t Val = Node->getAsZExtVal();        if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) {          splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val),                              uint32_t(Val)); @@ -1677,10 +1677,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) {             isInt<16>(cast<ConstantSDNode>(Op0)->getSExtValue())))) {        SDValue CCValid = Node->getOperand(2);        SDValue CCMask = Node->getOperand(3); -      uint64_t ConstCCValid = -        cast<ConstantSDNode>(CCValid.getNode())->getZExtValue(); -      uint64_t ConstCCMask = -        cast<ConstantSDNode>(CCMask.getNode())->getZExtValue(); +      uint64_t ConstCCValid = CCValid.getNode()->getAsZExtVal(); +      uint64_t ConstCCMask = CCMask.getNode()->getAsZExtVal();        // Invert the condition.        CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask,                                           SDLoc(Node), CCMask.getValueType()); diff --git a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 045c4c0aac07..2450c6801a66 100644 --- a/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2662,10 +2662,8 @@ static void adjustForFNeg(Comparison &C) {  // with (sext (trunc X)) into a comparison with (shl X, 32).  static void adjustForLTGFR(Comparison &C) {    // Check for a comparison between (shl X, 32) and 0. 
-  if (C.Op0.getOpcode() == ISD::SHL && -      C.Op0.getValueType() == MVT::i64 && -      C.Op1.getOpcode() == ISD::Constant && -      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { +  if (C.Op0.getOpcode() == ISD::SHL && C.Op0.getValueType() == MVT::i64 && +      C.Op1.getOpcode() == ISD::Constant && C.Op1->getAsZExtVal() == 0) {      auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));      if (C1 && C1->getZExtValue() == 32) {        SDValue ShlOp0 = C.Op0.getOperand(0); @@ -2690,7 +2688,7 @@ static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,        C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&        C.Op1.getOpcode() == ISD::Constant &&        cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 && -      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { +      C.Op1->getAsZExtVal() == 0) {      auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));      if (L->getMemoryVT().getStoreSizeInBits().getFixedValue() <=          C.Op0.getValueSizeInBits().getFixedValue()) { @@ -3035,12 +3033,12 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,          CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&          isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))        return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, -                             cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond); +                             CmpOp1->getAsZExtVal(), Cond);      if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&          CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&          isIntrinsicWithCC(CmpOp0, Opcode, CCValid))        return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, -                             cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond); +                             CmpOp1->getAsZExtVal(), Cond);    }    Comparison C(CmpOp0, CmpOp1, Chain);    C.CCMask = CCMaskForCondCode(Cond); @@ -3457,12 +3455,11 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,    // Check for absolute and negative-absolute selections, including those    // where the comparison value is sign-extended (for LPGFR and LNGFR).    // This check supplements the one in DAGCombiner. -  if (C.Opcode == SystemZISD::ICMP && -      C.CCMask != SystemZ::CCMASK_CMP_EQ && +  if (C.Opcode == SystemZISD::ICMP && C.CCMask != SystemZ::CCMASK_CMP_EQ &&        C.CCMask != SystemZ::CCMASK_CMP_NE &&        C.Op1.getOpcode() == ISD::Constant &&        cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 && -      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { +      C.Op1->getAsZExtVal() == 0) {      if (isAbsolute(C.Op0, TrueOp, FalseOp))        return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);      if (isAbsolute(C.Op0, FalseOp, TrueOp)) @@ -3947,8 +3944,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op,    // If user has set the no alignment function attribute, ignore    // alloca alignments. -  uint64_t AlignVal = -      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0); +  uint64_t AlignVal = (RealignOpt ? Align->getAsZExtVal() : 0);    uint64_t StackAlign = TFI->getStackAlignment();    uint64_t RequiredAlign = std::max(AlignVal, StackAlign); @@ -4013,8 +4009,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_ELF(SDValue Op,    // If user has set the no alignment function attribute, ignore    // alloca alignments. -  uint64_t AlignVal = -      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0); +  uint64_t AlignVal = (RealignOpt ? 
Align->getAsZExtVal() : 0);    uint64_t StackAlign = TFI->getStackAlignment();    uint64_t RequiredAlign = std::max(AlignVal, StackAlign); @@ -4213,7 +4208,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {    // If the low part is a constant that is outside the range of LHI,    // then we're better off using IILF.    if (LowOp.getOpcode() == ISD::Constant) { -    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue()); +    int64_t Value = int32_t(LowOp->getAsZExtVal());      if (!isInt<16>(Value))        return Op;    } @@ -5897,7 +5892,7 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,        Op1.getOpcode() != ISD::BITCAST &&        Op1.getOpcode() != ISD::ConstantFP &&        Op2.getOpcode() == ISD::Constant) { -    uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue(); +    uint64_t Index = Op2->getAsZExtVal();      unsigned Mask = VT.getVectorNumElements() - 1;      if (Index <= Mask)        return Op; diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 37abbb072cdd..15dc44a04395 100644 --- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -278,7 +278,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();          TmpOffset += SL->getElementOffset(Idx);        } else { -        uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +        uint64_t S = GTI.getSequentialElementStride(DL);          for (;;) {            if (const auto *CI = dyn_cast<ConstantInt>(Op)) {              // Constant-offset addressing. 
diff --git a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4bcf89690505..7c47790d1e35 100644 --- a/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1869,8 +1869,7 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,      Ops[OpIdx++] = Op.getOperand(2);      while (OpIdx < 18) {        const SDValue &MaskIdx = Op.getOperand(OpIdx + 1); -      if (MaskIdx.isUndef() || -          cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) { +      if (MaskIdx.isUndef() || MaskIdx.getNode()->getAsZExtVal() >= 32) {          bool isTarget = MaskIdx.getNode()->getOpcode() == ISD::TargetConstant;          Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32, isTarget);        } else { @@ -1912,7 +1911,7 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,    const SDNode *Index = Extract.getOperand(1).getNode();    if (!isa<ConstantSDNode>(Index))      return SDValue(); -  unsigned IndexVal = cast<ConstantSDNode>(Index)->getZExtValue(); +  unsigned IndexVal = Index->getAsZExtVal();    unsigned Scale =        ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();    assert(Scale > 1); @@ -2335,7 +2334,7 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op,    SDNode *IdxNode = Op.getOperand(Op.getNumOperands() - 1).getNode();    if (isa<ConstantSDNode>(IdxNode)) {      // Ensure the index type is i32 to match the tablegen patterns -    uint64_t Idx = cast<ConstantSDNode>(IdxNode)->getZExtValue(); +    uint64_t Idx = IdxNode->getAsZExtVal();      SmallVector<SDValue, 3> Ops(Op.getNode()->ops());      Ops[Op.getNumOperands() - 1] =          DAG.getConstant(Idx, SDLoc(IdxNode), MVT::i32); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h index 1f69feceae27..12134f7b00f1 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h @@ -21,7 +21,6 @@ namespace llvm {  class X86Subtarget;  class X86TargetMachine; -/// This class provides the information for the target register banks.  
class X86LegalizerInfo : public LegalizerInfo {  private:    /// Keep a reference to the X86Subtarget around so that we can diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index e006dd877360..304b998e1f26 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -148,25 +148,21 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {    case X86::AND16ri8:    case X86::AND16rm:    case X86::AND16rr: -  case X86::AND16rr_REV:    case X86::AND32i32:    case X86::AND32ri:    case X86::AND32ri8:    case X86::AND32rm:    case X86::AND32rr: -  case X86::AND32rr_REV:    case X86::AND64i32:    case X86::AND64ri32:    case X86::AND64ri8:    case X86::AND64rm:    case X86::AND64rr: -  case X86::AND64rr_REV:    case X86::AND8i8:    case X86::AND8ri:    case X86::AND8ri8:    case X86::AND8rm:    case X86::AND8rr: -  case X86::AND8rr_REV:      return FirstMacroFusionInstKind::And;    // CMP    case X86::CMP16i16: @@ -175,28 +171,24 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {    case X86::CMP16ri8:    case X86::CMP16rm:    case X86::CMP16rr: -  case X86::CMP16rr_REV:    case X86::CMP32i32:    case X86::CMP32mr:    case X86::CMP32ri:    case X86::CMP32ri8:    case X86::CMP32rm:    case X86::CMP32rr: -  case X86::CMP32rr_REV:    case X86::CMP64i32:    case X86::CMP64mr:    case X86::CMP64ri32:    case X86::CMP64ri8:    case X86::CMP64rm:    case X86::CMP64rr: -  case X86::CMP64rr_REV:    case X86::CMP8i8:    case X86::CMP8mr:    case X86::CMP8ri:    case X86::CMP8ri8:    case X86::CMP8rm:    case X86::CMP8rr: -  case X86::CMP8rr_REV:      return FirstMacroFusionInstKind::Cmp;    // ADD    case X86::ADD16i16: @@ -204,50 +196,42 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {    case X86::ADD16ri8:    case X86::ADD16rm:    case X86::ADD16rr: -  case X86::ADD16rr_REV:    case X86::ADD32i32:    case X86::ADD32ri:    case X86::ADD32ri8:    case X86::ADD32rm:    case X86::ADD32rr: -  case X86::ADD32rr_REV:    case X86::ADD64i32:    case X86::ADD64ri32:    case X86::ADD64ri8:    case X86::ADD64rm:    case X86::ADD64rr: -  case X86::ADD64rr_REV:    case X86::ADD8i8:    case X86::ADD8ri:    case X86::ADD8ri8:    case X86::ADD8rm:    case X86::ADD8rr: -  case X86::ADD8rr_REV:    // SUB    case X86::SUB16i16:    case X86::SUB16ri:    case X86::SUB16ri8:    case X86::SUB16rm:    case X86::SUB16rr: -  case X86::SUB16rr_REV:    case X86::SUB32i32:    case X86::SUB32ri:    case X86::SUB32ri8:    case X86::SUB32rm:    case X86::SUB32rr: -  case X86::SUB32rr_REV:    case X86::SUB64i32:    case X86::SUB64ri32:    case X86::SUB64ri8:    case X86::SUB64rm:    case X86::SUB64rr: -  case X86::SUB64rr_REV:    case X86::SUB8i8:    case X86::SUB8ri:    case X86::SUB8ri8:    case X86::SUB8rm:    case X86::SUB8rr: -  case X86::SUB8rr_REV:      return FirstMacroFusionInstKind::AddSub;    // INC    case X86::INC16r: diff --git a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 924956295e7c..f7c361393fea 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1650,6 +1650,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI,      if (HasVEX_4V) // Skip 1st src (which is encoded in 
VEX_VVVV)        ++SrcRegNum; +    if (IsND) // Skip new data destination +      ++CurOp; +      emitRegModRMByte(MI.getOperand(SrcRegNum),                       getX86RegNum(MI.getOperand(CurOp)), CB);      CurOp = SrcRegNum + 1; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86.h b/contrib/llvm-project/llvm/lib/Target/X86/X86.h index 485afbc1dfbc..21623a805f55 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86.h @@ -131,9 +131,9 @@ FunctionPass *createX86FixupBWInsts();  /// to another, when profitable.  FunctionPass *createX86DomainReassignmentPass(); -/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX -/// encoding when possible in order to reduce code size. -FunctionPass *createX86EvexToVexInsts(); +/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space when +/// possible in order to reduce code size or facilitate HW decoding. +FunctionPass *createX86CompressEVEXPass();  /// This pass creates the thunks for the retpoline feature.  FunctionPass *createX86IndirectThunksPass(); @@ -167,7 +167,7 @@ FunctionPass *createX86SpeculativeLoadHardeningPass();  FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();  FunctionPass *createX86ArgumentStackSlotPass(); -void initializeEvexToVexInstPassPass(PassRegistry &); +void initializeCompressEVEXPassPass(PassRegistry &);  void initializeFPSPass(PassRegistry &);  void initializeFixupBWInstPassPass(PassRegistry &);  void initializeFixupLEAPassPass(PassRegistry &); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86CompressEVEX.cpp index c425c37b4186..b95baddd9dea 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -1,5 +1,4 @@ -//===- X86EvexToVex.cpp ---------------------------------------------------===// -// Compress EVEX instructions to VEX encoding when possible to reduce code size +//===- X86CompressEVEX.cpp ------------------------------------------------===//  //  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.  // See https://llvm.org/LICENSE.txt for license information. @@ -7,17 +6,30 @@  //  //===----------------------------------------------------------------------===//  // -/// \file -/// This file defines the pass that goes over all AVX-512 instructions which -/// are encoded using the EVEX prefix and if possible replaces them by their -/// corresponding VEX encoding which is usually shorter by 2 bytes. -/// EVEX instructions may be encoded via the VEX prefix when the AVX-512 -/// instruction has a corresponding AVX/AVX2 opcode, when vector length -/// accessed by instruction is less than 512 bits and when it does not use -//  the xmm or the mask registers or xmm/ymm registers with indexes higher -//  than 15. -/// The pass applies code reduction on the generated code for AVX-512 instrs. +// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space +// when possible in order to reduce code size or facilitate HW decoding.  // +// Possible compression: +//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX) +//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX) +//   c. NDD (EVEX) -> non-NDD (legacy) +//   d. NF_ND (EVEX) -> NF (EVEX) +// +// Compression a, b and c can always reduce code size, with some exceptions +// such as promoted 16-bit CRC32 which is as long as the legacy version. 
+// +// legacy: +//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6] +// promoted: +//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6] +// +// From performance perspective, these should be same (same uops and same EXE +// ports). From a FMV perspective, an older legacy encoding is preferred b/c it +// can execute in more places (broader HW install base). So we will still do +// the compression. +// +// Compression d can help hardware decode (HW may skip reading the NDD +// register) although the instruction length remains unchanged.  //===----------------------------------------------------------------------===//  #include "MCTargetDesc/X86BaseInfo.h" @@ -38,37 +50,34 @@  using namespace llvm; -// Including the generated EVEX2VEX tables. -struct X86EvexToVexCompressTableEntry { -  uint16_t EvexOpc; -  uint16_t VexOpc; +// Including the generated EVEX compression tables. +struct X86CompressEVEXTableEntry { +  uint16_t OldOpc; +  uint16_t NewOpc; -  bool operator<(const X86EvexToVexCompressTableEntry &RHS) const { -    return EvexOpc < RHS.EvexOpc; +  bool operator<(const X86CompressEVEXTableEntry &RHS) const { +    return OldOpc < RHS.OldOpc;    } -  friend bool operator<(const X86EvexToVexCompressTableEntry &TE, -                        unsigned Opc) { -    return TE.EvexOpc < Opc; +  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) { +    return TE.OldOpc < Opc;    }  }; -#include "X86GenEVEX2VEXTables.inc" +#include "X86GenCompressEVEXTables.inc" -#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible" -#define EVEX2VEX_NAME "x86-evex-to-vex-compress" +#define COMP_EVEX_DESC "Compressing EVEX instrs when possible" +#define COMP_EVEX_NAME "x86-compress-evex" -#define DEBUG_TYPE EVEX2VEX_NAME +#define DEBUG_TYPE COMP_EVEX_NAME  namespace { -class EvexToVexInstPass : public MachineFunctionPass { +class CompressEVEXPass : public MachineFunctionPass {  public:    static char ID; -  EvexToVexInstPass() : MachineFunctionPass(ID) {} -  StringRef getPassName() const override { return EVEX2VEX_DESC; } +  CompressEVEXPass() : MachineFunctionPass(ID) {} +  StringRef getPassName() const override { return COMP_EVEX_DESC; } -  /// Loop over all of the basic blocks, replacing EVEX instructions -  /// by equivalent VEX instructions when possible for reducing code size.    bool runOnMachineFunction(MachineFunction &MF) override;    // This pass runs after regalloc and doesn't support VReg operands. @@ -80,7 +89,7 @@ public:  } // end anonymous namespace -char EvexToVexInstPass::ID = 0; +char CompressEVEXPass::ID = 0;  static bool usesExtendedRegister(const MachineInstr &MI) {    auto isHiRegIdx = [](unsigned Reg) { @@ -112,8 +121,8 @@ static bool usesExtendedRegister(const MachineInstr &MI) {    return false;  } -static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) { -  switch (EvexOpc) { +static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) { +  switch (OldOpc) {    default:      return true;    case X86::VCVTNEPS2BF16Z128rm: @@ -151,15 +160,15 @@ static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) {  }  // Do any custom cleanup needed to finalize the conversion. 
-static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { -  (void)VexOpc; +static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { +  (void)NewOpc;    unsigned Opc = MI.getOpcode();    switch (Opc) {    case X86::VALIGNDZ128rri:    case X86::VALIGNDZ128rmi:    case X86::VALIGNQZ128rri:    case X86::VALIGNQZ128rmi: { -    assert((VexOpc == X86::VPALIGNRrri || VexOpc == X86::VPALIGNRrmi) && +    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&             "Unexpected new opcode!");      unsigned Scale =          (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4; @@ -175,8 +184,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {    case X86::VSHUFI32X4Z256rri:    case X86::VSHUFI64X2Z256rmi:    case X86::VSHUFI64X2Z256rri: { -    assert((VexOpc == X86::VPERM2F128rr || VexOpc == X86::VPERM2I128rr || -            VexOpc == X86::VPERM2F128rm || VexOpc == X86::VPERM2I128rm) && +    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr || +            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&             "Unexpected new opcode!");      MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);      int64_t ImmVal = Imm.getImm(); @@ -200,7 +209,7 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {    case X86::VRNDSCALESDZm_Int:    case X86::VRNDSCALESSZr_Int:    case X86::VRNDSCALESSZm_Int: -    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1); +    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);      int64_t ImmVal = Imm.getImm();      // Ensure that only bits 3:0 of the immediate are used.      if ((ImmVal & 0xf) != ImmVal) @@ -211,86 +220,77 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {    return true;  } -// For EVEX instructions that can be encoded using VEX encoding -// replace them by the VEX encoding in order to reduce size. -static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { -  // VEX format. -  // # of bytes: 0,2,3  1      1      0,1   0,1,2,4  0,1 -  //  [Prefixes] [VEX]  OPCODE ModR/M [SIB] [DISP]  [IMM] -  // -  // EVEX format. -  //  # of bytes: 4    1      1      1      4       / 1         1 -  //  [Prefixes]  EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] -  const MCInstrDesc &Desc = MI.getDesc(); +static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { +  uint64_t TSFlags = MI.getDesc().TSFlags;    // Check for EVEX instructions only. -  if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) +  if ((TSFlags & X86II::EncodingMask) != X86II::EVEX)      return false; -  // Check for EVEX instructions with mask or broadcast as in these cases -  // the EVEX prefix is needed in order to carry this information -  // thus preventing the transformation to VEX encoding. -  if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) +  // Instructions with mask or 512-bit vector can't be converted to VEX. +  if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2))      return false; -  // Check for EVEX instructions with L2 set. These instructions are 512-bits -  // and can't be converted to VEX. -  if (Desc.TSFlags & X86II::EVEX_L2) +  // EVEX_B has several meanings. 
+  // AVX512: +  //  register form: rounding control or SAE +  //  memory form: broadcast +  // +  // APX: +  //  MAP4: NDD +  // +  // For AVX512 cases, EVEX prefix is needed in order to carry this information +  // thus preventing the transformation to VEX encoding. +  if (TSFlags & X86II::EVEX_B)      return false; -  // Use the VEX.L bit to select the 128 or 256-bit table. -  ArrayRef<X86EvexToVexCompressTableEntry> Table = -      (Desc.TSFlags & X86II::VEX_L) ? ArrayRef(X86EvexToVex256CompressTable) -                                    : ArrayRef(X86EvexToVex128CompressTable); +  ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable); -  unsigned EvexOpc = MI.getOpcode(); -  const auto *I = llvm::lower_bound(Table, EvexOpc); -  if (I == Table.end() || I->EvexOpc != EvexOpc) +  unsigned Opc = MI.getOpcode(); +  const auto *I = llvm::lower_bound(Table, Opc); +  if (I == Table.end() || I->OldOpc != Opc)      return false; -  if (usesExtendedRegister(MI)) -    return false; -  if (!checkVEXInstPredicate(EvexOpc, ST)) -    return false; -  if (!performCustomAdjustments(MI, I->VexOpc)) +  if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) || +      !performCustomAdjustments(MI, I->NewOpc))      return false; -  MI.setDesc(ST.getInstrInfo()->get(I->VexOpc)); -  MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); +  const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc); +  MI.setDesc(NewDesc); +  uint64_t Encoding = NewDesc.TSFlags & X86II::EncodingMask; +  auto AsmComment = +      (Encoding == X86II::VEX) ? X86::AC_EVEX_2_VEX : X86::AC_EVEX_2_LEGACY; +  MI.setAsmPrinterFlag(AsmComment);    return true;  } -bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { +bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {  #ifndef NDEBUG    // Make sure the tables are sorted.    static std::atomic<bool> TableChecked(false);    if (!TableChecked.load(std::memory_order_relaxed)) { -    assert(llvm::is_sorted(X86EvexToVex128CompressTable) && -           "X86EvexToVex128CompressTable is not sorted!"); -    assert(llvm::is_sorted(X86EvexToVex256CompressTable) && -           "X86EvexToVex256CompressTable is not sorted!"); +    assert(llvm::is_sorted(X86CompressEVEXTable) && +           "X86CompressEVEXTable is not sorted!");      TableChecked.store(true, std::memory_order_relaxed);    }  #endif    const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); -  if (!ST.hasAVX512()) +  if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD())      return false;    bool Changed = false; -  /// Go over all basic blocks in function and replace -  /// EVEX encoded instrs by VEX encoding when possible.    for (MachineBasicBlock &MBB : MF) {      // Traverse the basic block.      
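The renamed pass drives everything off one table sorted by the old (EVEX) opcode and probed with lower_bound, as the X86CompressEVEXTableEntry comparators and the is_sorted assertion above show. A minimal standalone sketch of that lookup shape, with made-up opcode numbers and no LLVM dependencies:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct CompressTableEntry {
  uint16_t OldOpc; // made-up opcode values, for illustration only
  uint16_t NewOpc;
  bool operator<(const CompressTableEntry &RHS) const {
    return OldOpc < RHS.OldOpc;
  }
  friend bool operator<(const CompressTableEntry &TE, unsigned Opc) {
    return TE.OldOpc < Opc;
  }
};

int main() {
  // The table must stay sorted by OldOpc or lower_bound breaks, which is
  // what the NDEBUG-guarded is_sorted check in the pass guards against.
  const std::vector<CompressTableEntry> Table = {{100, 10}, {200, 20}, {300, 30}};
  unsigned Opc = 200; // opcode of the instruction being considered
  auto I = std::lower_bound(Table.begin(), Table.end(), Opc);
  if (I != Table.end() && I->OldOpc == Opc)
    std::printf("compress opcode %u -> %u\n", Opc, (unsigned)I->NewOpc);
  else
    std::printf("opcode %u has no compressed form\n", Opc);
  return 0;
}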
for (MachineInstr &MI : MBB) -      Changed |= CompressEvexToVexImpl(MI, ST); +      Changed |= CompressEVEXImpl(MI, ST);    }    return Changed;  } -INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) +INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false) -FunctionPass *llvm::createX86EvexToVexInsts() { -  return new EvexToVexInstPass(); +FunctionPass *llvm::createX86CompressEVEXPass() { +  return new CompressEVEXPass();  } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp index bdd86e48fa54..20dbaf797e32 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -619,40 +619,30 @@ void X86DomainReassignment::initConverters() {          std::make_unique<InstrReplacerDstCOPY>(From, To);    }; -  bool HasEGPR = STI->hasEGPR(); -  createReplacerDstCOPY(X86::MOVZX32rm16, -                        HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); -  createReplacerDstCOPY(X86::MOVZX64rm16, -                        HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); +#define GET_EGPR_IF_ENABLED(OPC) STI->hasEGPR() ? OPC##_EVEX : OPC +  createReplacerDstCOPY(X86::MOVZX32rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); +  createReplacerDstCOPY(X86::MOVZX64rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); -  createReplacerDstCOPY(X86::MOVZX32rr16, -                        HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk); -  createReplacerDstCOPY(X86::MOVZX64rr16, -                        HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk); +  createReplacerDstCOPY(X86::MOVZX32rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk)); +  createReplacerDstCOPY(X86::MOVZX64rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk));    if (STI->hasDQI()) { -    createReplacerDstCOPY(X86::MOVZX16rm8, -                          HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); -    createReplacerDstCOPY(X86::MOVZX32rm8, -                          HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); -    createReplacerDstCOPY(X86::MOVZX64rm8, -                          HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); - -    createReplacerDstCOPY(X86::MOVZX16rr8, -                          HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); -    createReplacerDstCOPY(X86::MOVZX32rr8, -                          HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); -    createReplacerDstCOPY(X86::MOVZX64rr8, -                          HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); +    createReplacerDstCOPY(X86::MOVZX16rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); +    createReplacerDstCOPY(X86::MOVZX32rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); +    createReplacerDstCOPY(X86::MOVZX64rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); + +    createReplacerDstCOPY(X86::MOVZX16rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); +    createReplacerDstCOPY(X86::MOVZX32rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); +    createReplacerDstCOPY(X86::MOVZX64rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk));    }    auto createReplacer = [&](unsigned From, unsigned To) {      Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To);    }; -  createReplacer(X86::MOV16rm, HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); -  createReplacer(X86::MOV16mr, HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk); -  createReplacer(X86::MOV16rr, HasEGPR ? 
X86::KMOVWkk_EVEX : X86::KMOVWkk); +  createReplacer(X86::MOV16rm, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); +  createReplacer(X86::MOV16mr, GET_EGPR_IF_ENABLED(X86::KMOVWmk)); +  createReplacer(X86::MOV16rr, GET_EGPR_IF_ENABLED(X86::KMOVWkk));    createReplacer(X86::SHR16ri, X86::KSHIFTRWri);    createReplacer(X86::SHL16ri, X86::KSHIFTLWri);    createReplacer(X86::NOT16r, X86::KNOTWrr); @@ -661,14 +651,14 @@ void X86DomainReassignment::initConverters() {    createReplacer(X86::XOR16rr, X86::KXORWrr);    if (STI->hasBWI()) { -    createReplacer(X86::MOV32rm, HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm); -    createReplacer(X86::MOV64rm, HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm); +    createReplacer(X86::MOV32rm, GET_EGPR_IF_ENABLED(X86::KMOVDkm)); +    createReplacer(X86::MOV64rm, GET_EGPR_IF_ENABLED(X86::KMOVQkm)); -    createReplacer(X86::MOV32mr, HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk); -    createReplacer(X86::MOV64mr, HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk); +    createReplacer(X86::MOV32mr, GET_EGPR_IF_ENABLED(X86::KMOVDmk)); +    createReplacer(X86::MOV64mr, GET_EGPR_IF_ENABLED(X86::KMOVQmk)); -    createReplacer(X86::MOV32rr, HasEGPR ? X86::KMOVDkk_EVEX : X86::KMOVDkk); -    createReplacer(X86::MOV64rr, HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk); +    createReplacer(X86::MOV32rr, GET_EGPR_IF_ENABLED(X86::KMOVDkk)); +    createReplacer(X86::MOV64rr, GET_EGPR_IF_ENABLED(X86::KMOVQkk));      createReplacer(X86::SHR32ri, X86::KSHIFTRDri);      createReplacer(X86::SHR64ri, X86::KSHIFTRQri); @@ -696,8 +686,8 @@ void X86DomainReassignment::initConverters() {      // TODO: KTEST is not a replacement for TEST due to flag differences. Need      // to prove only Z flag is used. -    //createReplacer(X86::TEST32rr, X86::KTESTDrr); -    //createReplacer(X86::TEST64rr, X86::KTESTQrr); +    // createReplacer(X86::TEST32rr, X86::KTESTDrr); +    // createReplacer(X86::TEST64rr, X86::KTESTQrr);    }    if (STI->hasDQI()) { @@ -706,9 +696,9 @@ void X86DomainReassignment::initConverters() {      createReplacer(X86::AND8rr, X86::KANDBrr); -    createReplacer(X86::MOV8rm, HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); -    createReplacer(X86::MOV8mr, HasEGPR ? X86::KMOVBmk_EVEX : X86::KMOVBmk); -    createReplacer(X86::MOV8rr, HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); +    createReplacer(X86::MOV8rm, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); +    createReplacer(X86::MOV8mr, GET_EGPR_IF_ENABLED(X86::KMOVBmk)); +    createReplacer(X86::MOV8rr, GET_EGPR_IF_ENABLED(X86::KMOVBkk));      createReplacer(X86::NOT8r, X86::KNOTBrr); @@ -719,11 +709,12 @@ void X86DomainReassignment::initConverters() {      // TODO: KTEST is not a replacement for TEST due to flag differences. Need      // to prove only Z flag is used. -    //createReplacer(X86::TEST8rr, X86::KTESTBrr); -    //createReplacer(X86::TEST16rr, X86::KTESTWrr); +    // createReplacer(X86::TEST8rr, X86::KTESTBrr); +    // createReplacer(X86::TEST16rr, X86::KTESTWrr);      createReplacer(X86::XOR8rr, X86::KXORBrr);    } +#undef GET_EGPR_IF_ENABLED  }  bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp index 0ba31e173a1a..1ce1e6f6a563 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FastISel.cpp @@ -916,7 +916,7 @@ redo_gep:        // A array/variable index is always of the form i*S where S is the        // constant scale size.  
See if we can push the scale into immediates. -      uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); +      uint64_t S = GTI.getSequentialElementStride(DL);        for (;;) {          if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {            // Constant-offset addressing. @@ -3046,22 +3046,24 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {      switch (II->getIntrinsicID()) {      default:        llvm_unreachable("Unexpected intrinsic."); +#define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC      case Intrinsic::x86_sse42_crc32_32_8: -      Opc = X86::CRC32r32r8; +      Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8);        RC = &X86::GR32RegClass;        break;      case Intrinsic::x86_sse42_crc32_32_16: -      Opc = X86::CRC32r32r16; +      Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16);        RC = &X86::GR32RegClass;        break;      case Intrinsic::x86_sse42_crc32_32_32: -      Opc = X86::CRC32r32r32; +      Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32);        RC = &X86::GR32RegClass;        break;      case Intrinsic::x86_sse42_crc32_64_64: -      Opc = X86::CRC32r64r64; +      Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64);        RC = &X86::GR64RegClass;        break; +#undef GET_EGPR_IF_ENABLED      }      const Value *LHS = II->getArgOperand(0); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index b13bf361ab79..aad839b83ee1 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -173,7 +173,6 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {  #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \    LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \ -  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \    LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \    LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \    case X86::MNEMONIC##8ri:                                                     \ diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 73b10cf3067e..53ce720be2da 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2852,7 +2852,7 @@ bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,                                         SDValue &Index, SDValue &Disp,                                         SDValue &Segment) {    X86ISelAddressMode AM; -  AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); +  AM.Scale = ScaleOp->getAsZExtVal();    // Attempt to match index patterns, as long as we're not relying on implicit    // sign-extension, which is performed BEFORE scale. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp index 1e4b1361f98a..5a28240ea9e2 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7371,7 +7371,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,  /// index.  
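The FastISel address-computation hunks above (X86 and WebAssembly) reduce a constant GEP index to the arithmetic spelled out in the i*S comment: the byte offset contributed by the index is the index times the element stride now returned by getSequentialElementStride. A throwaway sketch of just that arithmetic, with an assumed 4-byte element:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Stride = 4; // assumed element stride, e.g. a 4-byte element
  const int64_t Index = 3;   // constant GEP index
  int64_t TotalOffs = 0;
  // Offset contribution of a constant index: i * S from the comment above.
  TotalOffs += (int64_t)Stride * Index;
  std::printf("byte offset contributed by the index: %lld\n",
              (long long)TotalOffs);
  return 0;
}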
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,                                           SDValue ExtIdx) { -  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); +  int Idx = ExtIdx->getAsZExtVal();    if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))      return Idx; @@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {  static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,                                         const X86Subtarget &Subtarget) {    MVT VT = Op.getSimpleValueType(); -  MVT IVT = VT.changeVectorElementTypeToInteger(); +  MVT IVT = +      VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);    SmallVector<SDValue, 16> NewOps;    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) -    NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I))); +    NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16, +                                    Op.getOperand(I)));    SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);    return DAG.getBitcast(VT, Res);  } @@ -8793,7 +8795,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {      MachineFunction &MF = DAG.getMachineFunction();      MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);      SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); -    unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue(); +    unsigned InsertC = InsIndex->getAsZExtVal();      unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();      if (InsertC < NumEltsInLow128Bits)        return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); @@ -14369,6 +14371,13 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,                                    const APInt &Zeroable,                                    const X86Subtarget &Subtarget,                                    SelectionDAG &DAG) { +  if (VT == MVT::v8bf16) { +    V1 = DAG.getBitcast(MVT::v8i16, V1); +    V2 = DAG.getBitcast(MVT::v8i16, V2); +    return DAG.getBitcast(VT, +                          DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); +  } +    switch (VT.SimpleTy) {    case MVT::v2i64:      return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -17096,14 +17105,14 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,      return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);    } -  if (VT == MVT::v32f16) { +  if (VT == MVT::v32f16 || VT == MVT::v32bf16) {      if (!Subtarget.hasBWI())        return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,                                    /*SimpleOnly*/ false);      V1 = DAG.getBitcast(MVT::v32i16, V1);      V2 = DAG.getBitcast(MVT::v32i16, V2); -    return DAG.getBitcast(MVT::v32f16, +    return DAG.getBitcast(VT,                            DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));    } @@ -17747,7 +17756,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {                           DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,                                       DAG.getBitcast(MVT::v4i32, Vec), Idx)); -    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); +    unsigned IdxVal = Idx->getAsZExtVal();      SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,                                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));      return 
DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); @@ -21515,9 +21524,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,    RTLIB::Libcall LC =        RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);    SDValue Res = -      makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first; -  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, -                     DAG.getBitcast(MVT::i32, Res)); +      makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first; +  return DAG.getBitcast(MVT::i16, Res);  }  /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -24061,7 +24069,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {    // a >= b ? -1 :  0 -> RES = setcc_carry    // a >= b ?  0 : -1 -> RES = ~setcc_carry    if (Cond.getOpcode() == X86ISD::SUB) { -    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); +    unsigned CondCode = CC->getAsZExtVal();      if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&          (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && @@ -25359,8 +25367,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,        if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&            Src3.getValueType() != MVT::i8) { -        Src3 = DAG.getTargetConstant( -            cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8); +        Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);        }        // We specify 2 possible opcodes for intrinsics with rounding modes. @@ -25385,8 +25392,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,        assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);        SDValue Src4 = Op.getOperand(4);        if (Src4.getValueType() != MVT::i8) { -        Src4 = DAG.getTargetConstant( -            cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8); +        Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);        }        return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), @@ -26788,7 +26794,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,                                    {Chain, Op1, Op2, Size}, VT, MMO);        Chain = Res.getValue(1);        Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); -      unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); +      unsigned Imm = Op2->getAsZExtVal();        if (Imm)          Res = DAG.getNode(ISD::SHL, DL, VT, Res,                            DAG.getShiftAmountConstant(Imm, VT, DL)); @@ -40221,6 +40227,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,      }      return SDValue();    } +  case X86ISD::SHUF128: { +    // If we're permuting the upper 256-bits subvectors of a concatenation, then +    // see if we can peek through and access the subvector directly. +    if (VT.is512BitVector()) { +      // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the +      // upper subvector is used. 
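The check that follows relies on how a 512-bit SHUF128 immediate is packed (per the comment above and the 0x0A/0xA0 split in the code): four 2-bit lane selectors, where selectors 0-1 read 128-bit lanes from the first source, selectors 2-3 from the second, and the high bit of each selector picks a lane in the upper 256-bit half. A standalone decode of an arbitrarily chosen immediate, showing why (Mask & 0x0A) == 0x0A means only the upper half of the first source is referenced:

#include <cstdio>

int main() {
  const unsigned Mask = 0xDE; // example immediate: selectors 2, 3, 1, 3
  for (int I = 0; I < 4; ++I) {
    unsigned Sel = (Mask >> (2 * I)) & 0x3; // 2-bit selector number I
    std::printf("result lane %d <- 128-bit lane %u of %s\n", I, Sel,
                I < 2 ? "the first source" : "the second source");
  }
  // Bits 1 and 3 are the high bits of the two first-source selectors, which
  // is exactly what the (Mask & 0x0A) == 0x0A test asks about.
  const bool OnlyUpperOfSrc1 = (Mask & 0x0A) == 0x0A;
  std::printf("first source: only its upper 256 bits used? %s\n",
              OnlyUpperOfSrc1 ? "yes" : "no");
  return 0;
}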
+      SDValue LHS = N->getOperand(0); +      SDValue RHS = N->getOperand(1); +      uint64_t Mask = N->getConstantOperandVal(2); +      SmallVector<SDValue> LHSOps, RHSOps; +      SDValue NewLHS, NewRHS; +      if ((Mask & 0x0A) == 0x0A && +          collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) { +        NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512); +        Mask &= ~0x0A; +      } +      if ((Mask & 0xA0) == 0xA0 && +          collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) { +        NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512); +        Mask &= ~0xA0; +      } +      if (NewLHS || NewRHS) +        return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS, +                           NewRHS ? NewRHS : RHS, +                           DAG.getTargetConstant(Mask, DL, MVT::i8)); +    } +    return SDValue(); +  }    case X86ISD::VPERM2X128: {      // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).      SDValue LHS = N->getOperand(0); @@ -41320,6 +41354,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(        return TLO.CombineTo(Op, Src);      break;    } +  case X86ISD::VZEXT_LOAD: { +    // If upper demanded elements are not demanded then simplify to a +    // scalar_to_vector(load()). +    MVT SVT = VT.getSimpleVT().getVectorElementType(); +    if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { +      SDLoc DL(Op); +      auto *Mem = cast<MemSDNode>(Op); +      SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(), +                                    Mem->getMemOperand()); +      SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt); +      return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec)); +    } +    break; +  }    case X86ISD::VBROADCAST: {      SDValue Src = Op.getOperand(0);      MVT SrcVT = Src.getSimpleValueType(); @@ -41795,7 +41843,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(      SDValue Op0 = Op.getOperand(0);      SDValue Op1 = Op.getOperand(1); -    unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue(); +    unsigned ShAmt = Op1->getAsZExtVal();      if (ShAmt >= BitWidth)        break; @@ -42580,7 +42628,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {    APInt Imm(SrcVT.getVectorNumElements(), 0);    for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {      SDValue In = Op.getOperand(Idx); -    if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1)) +    if (!In.isUndef() && (In->getAsZExtVal() & 0x1))        Imm.setBit(Idx);    }    EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); @@ -49931,18 +49979,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,      SDValue Ptr = Ld->getBasePtr();      SDValue Chain = Ld->getChain();      for (SDNode *User : Chain->uses()) { -      if (User != N && +      auto *UserLd = dyn_cast<MemSDNode>(User); +      if (User != N && UserLd &&            (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||             User->getOpcode() == X86ISD::VBROADCAST_LOAD ||             ISD::isNormalLoad(User)) && -          cast<MemSDNode>(User)->getChain() == Chain && -          !User->hasAnyUseOfValue(1) && +          UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&            User->getValueSizeInBits(0).getFixedValue() >                RegVT.getFixedSizeInBits()) {          if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD 
&& -            cast<MemSDNode>(User)->getBasePtr() == Ptr && -            cast<MemSDNode>(User)->getMemoryVT().getSizeInBits() == -                MemVT.getSizeInBits()) { +            UserLd->getBasePtr() == Ptr && +            UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {            SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),                                               RegVT.getSizeInBits());            Extract = DAG.getBitcast(RegVT, Extract); @@ -49961,7 +50008,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,          // See if we are loading a constant that matches in the lower          // bits of a longer constant (but from a different constant pool ptr).          EVT UserVT = User->getValueType(0); -        SDValue UserPtr = cast<MemSDNode>(User)->getBasePtr(); +        SDValue UserPtr = UserLd->getBasePtr();          const Constant *LdC = getTargetConstantFromBasePtr(Ptr);          const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);          if (LdC && UserC && UserPtr != Ptr) { @@ -53258,7 +53305,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,    if (Index.getOpcode() == ISD::ADD &&        Index.getValueType().getVectorElementType() == PtrVT &&        isa<ConstantSDNode>(Scale)) { -    uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue(); +    uint64_t ScaleAmt = Scale->getAsZExtVal();      if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {        BitVector UndefElts;        if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) { @@ -54572,6 +54619,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,            Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())          return Op0.getOperand(0);      } + +    // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x)) +    if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() && +        !X86::mayFoldLoad(Op0.getOperand(0), Subtarget)) +      return DAG.getNode(Op0.getOpcode(), DL, VT, +                         DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, +                                     Op0.getOperand(0), Op0.getOperand(0)), +                         Op0.getOperand(1));    }    // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. 
@@ -54979,6 +55034,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,                             ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));        }        break; +    case X86ISD::BLENDI: +      if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) { +        uint64_t Mask0 = Ops[0].getConstantOperandVal(2); +        uint64_t Mask1 = Ops[1].getConstantOperandVal(2); +        uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0; +        MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements()); +        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); +        SDValue Sel = +            DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT)); +        return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1), +                             ConcatSubOperand(VT, Ops, 0)); +      } +      break;      case ISD::VSELECT:        if (!IsSplat && Subtarget.hasAVX512() &&            (VT.is256BitVector() || @@ -57602,7 +57670,7 @@ X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {  }  Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { -  if (ML->isInnermost() && +  if (ML && ML->isInnermost() &&        ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())      return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);    return TargetLowering::getPrefLoopAlignment(); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h index 9bd1622cb0d3..32745400a38b 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h @@ -1714,16 +1714,6 @@ namespace llvm {        MachineBasicBlock *Entry,        const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; -    bool splitValueIntoRegisterParts( -        SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, -        unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) -        const override; - -    SDValue joinRegisterPartsIntoValue( -        SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts, -        unsigned NumParts, MVT PartVT, EVT ValueVT, -        std::optional<CallingConv::ID> CC) const override; -      bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;      bool mayBeEmittedAsTailCall(const CallInst *CI) const override; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b8b5421b9005..d75bd4171fde 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,      return getRegisterTypeForCallingConv(Context, CC,                                           VT.changeVectorElementType(MVT::f16)); +  if (VT == MVT::bf16) +    return MVT::f16; +    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);  } @@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {    return TargetLowering::getJumpTableEncoding();  } -bool X86TargetLowering::splitValueIntoRegisterParts( -    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, -    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { -  bool IsABIRegCopy = CC.has_value(); -  EVT ValueVT = Val.getValueType(); -  if (IsABIRegCopy 
&& ValueVT == MVT::bf16 && PartVT == MVT::f32) { -    unsigned ValueBits = ValueVT.getSizeInBits(); -    unsigned PartBits = PartVT.getSizeInBits(); -    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); -    Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); -    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); -    Parts[0] = Val; -    return true; -  } -  return false; -} - -SDValue X86TargetLowering::joinRegisterPartsIntoValue( -    SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, -    MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { -  bool IsABIRegCopy = CC.has_value(); -  if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { -    unsigned ValueBits = ValueVT.getSizeInBits(); -    unsigned PartBits = PartVT.getSizeInBits(); -    SDValue Val = Parts[0]; - -    Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); -    Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); -    Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); -    return Val; -  } -  return SDValue(); -} -  bool X86TargetLowering::useSoftFloat() const {    return Subtarget.useSoftFloat();  } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td index c3a673f97d34..fe7d90fbcdf7 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrAVX512.td @@ -448,7 +448,7 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,                                     X86VectorVTInfo< 2, EltVT64, VR128X>,                                     X86VectorVTInfo< 4, EltVT64, VR256X>,                                     null_frag, vinsert128_insert, sched>, -                                   VEX_W1X, EVEX_V256; +                                   EVEX_V256, REX_W;    // Even with DQI we'd like to only use these instructions for masking.    let Predicates = [HasDQI] in { @@ -750,7 +750,7 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,                                   X86VectorVTInfo< 4, EltVT64, VR256X>,                                   X86VectorVTInfo< 2, EltVT64, VR128X>,                                   null_frag, vextract128_extract, SchedRR, SchedMR>, -                                     VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; +                                    EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;    // Even with DQI we'd like to only use these instructions for masking.    
let Predicates = [HasDQI] in { @@ -1161,7 +1161,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,  defm VBROADCASTSS  : avx512_fp_broadcast_ss<0x18, "vbroadcastss",                                         avx512vl_f32_info>;  defm VBROADCASTSD  : avx512_fp_broadcast_sd<0x19, "vbroadcastsd", -                                       avx512vl_f64_info>, VEX_W1X; +                                       avx512vl_f64_info>, REX_W;  multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,                                      X86VectorVTInfo _, SDPatternOperator OpNode, @@ -1267,7 +1267,7 @@ defm VPBROADCASTW  : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw",  defm VPBROADCASTD  : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd",                                             avx512vl_i32_info, HasAVX512, 1>;  defm VPBROADCASTQ  : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", -                                           avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; +                                           avx512vl_i64_info, HasAVX512, 1>, REX_W;  multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,                                        SDPatternOperator OpNode, @@ -1460,11 +1460,11 @@ let Predicates = [HasBF16, HasVLX] in  let Predicates = [HasVLX, HasDQI] in {  defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", -                           X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, -                           EVEX_V256, EVEX_CD8<64, CD8VT2>; +                           X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, +                           EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;  defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", -                           X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, -                           EVEX_V256, EVEX_CD8<64, CD8VT2>; +                           X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, +                           EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W;  // Patterns for selects of bitcasted operations.  
def : Pat<(vselect_mask VK4WM:$mask, @@ -3185,15 +3185,13 @@ defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;  multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,                         X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, -                       X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, -                       bit NoRMPattern = 0, +                       X86SchedWriteMoveLS Sched, bit NoRMPattern = 0,                         SDPatternOperator SelectOprr = vselect> {    let hasSideEffects = 0 in {    let isMoveReg = 1 in    def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], -                    _.ExeDomain>, EVEX, Sched<[Sched.RR]>, -                    EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; +                    _.ExeDomain>, EVEX, Sched<[Sched.RR]>;    def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),                        (ins _.KRCWM:$mask,  _.RC:$src),                        !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", @@ -3209,8 +3207,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,                      !if(NoRMPattern, [],                          [(set _.RC:$dst,                            (_.VT (ld_frag addr:$src)))]), -                    _.ExeDomain>, EVEX, Sched<[Sched.RM]>, -                    EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; +                    _.ExeDomain>, EVEX, Sched<[Sched.RM]>;    let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {      def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), @@ -3253,53 +3250,48 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,  multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,                                   AVX512VLVectorVTInfo _, Predicate prd,                                   X86SchedWriteMoveLSWidths Sched, -                                 string EVEX2VEXOvrd, bit NoRMPattern = 0> { +                                 bit NoRMPattern = 0> {    let Predicates = [prd] in    defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512,                         _.info512.AlignedLdFrag, masked_load_aligned, -                       Sched.ZMM, "", NoRMPattern>, EVEX_V512; +                       Sched.ZMM, NoRMPattern>, EVEX_V512;    let Predicates = [prd, HasVLX] in {    defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256,                            _.info256.AlignedLdFrag, masked_load_aligned, -                          Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256; +                          Sched.YMM, NoRMPattern>, EVEX_V256;    defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128,                            _.info128.AlignedLdFrag, masked_load_aligned, -                          Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128; +                          Sched.XMM, NoRMPattern>, EVEX_V128;    }  }  multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,                            AVX512VLVectorVTInfo _, Predicate prd,                            X86SchedWriteMoveLSWidths Sched, -                          string EVEX2VEXOvrd, bit NoRMPattern = 0, +                          bit NoRMPattern = 0,                            SDPatternOperator SelectOprr = vselect> {    let Predicates = [prd] in    defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag, -                       masked_load, Sched.ZMM, "", -                       NoRMPattern, SelectOprr>, EVEX_V512; +                 
      masked_load, Sched.ZMM, NoRMPattern, SelectOprr>, EVEX_V512;    let Predicates = [prd, HasVLX] in {    defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag, -                         masked_load, Sched.YMM, EVEX2VEXOvrd#"Y", -                         NoRMPattern, SelectOprr>, EVEX_V256; +                         masked_load, Sched.YMM, NoRMPattern, SelectOprr>, EVEX_V256;    defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag, -                         masked_load, Sched.XMM, EVEX2VEXOvrd, -                         NoRMPattern, SelectOprr>, EVEX_V128; +                         masked_load, Sched.XMM, NoRMPattern, SelectOprr>, EVEX_V128;    }  }  multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,                          X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, -                        X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, -                        bit NoMRPattern = 0> { +                        X86SchedWriteMoveLS Sched, bit NoMRPattern = 0> {    let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {    let isMoveReg = 1 in    def rr_REV  : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),                           OpcodeStr # "\t{$src, $dst|$dst, $src}",                           [], _.ExeDomain>, EVEX, -                         Sched<[Sched.RR]>, -                         EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">; +                         Sched<[Sched.RR]>;    def rrk_REV : AVX512PI<opc, MRMDestReg, (outs  _.RC:$dst),                           (ins _.KRCWM:$mask, _.RC:$src),                           OpcodeStr # "\t{$src, ${dst} {${mask}}|"# @@ -3319,8 +3311,7 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),                      !if(NoMRPattern, [],                          [(st_frag (_.VT _.RC:$src), addr:$dst)]), -                    _.ExeDomain>, EVEX, Sched<[Sched.MR]>, -                    EVEX2VEXOverride<EVEX2VEXOvrd#"mr">; +                    _.ExeDomain>, EVEX, Sched<[Sched.MR]>;    def mrk : AVX512PI<opc, MRMDestMem, (outs),                       (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),                OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", @@ -3344,102 +3335,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName,  multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,                              AVX512VLVectorVTInfo _, Predicate prd,                              X86SchedWriteMoveLSWidths Sched, -                            string EVEX2VEXOvrd, bit NoMRPattern = 0> { +                            bit NoMRPattern = 0> {    let Predicates = [prd] in    defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store, -                        masked_store, Sched.ZMM, "", -                        NoMRPattern>, EVEX_V512; +                        masked_store, Sched.ZMM, NoMRPattern>, EVEX_V512;    let Predicates = [prd, HasVLX] in {      defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store, -                             masked_store, Sched.YMM, -                             EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; +                             masked_store, Sched.YMM, NoMRPattern>, EVEX_V256;      defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store, -                             masked_store, Sched.XMM, EVEX2VEXOvrd, -                             NoMRPattern>, EVEX_V128; +                         
    masked_store, Sched.XMM, NoMRPattern>, EVEX_V128;    }  }  multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,                                    AVX512VLVectorVTInfo _, Predicate prd,                                    X86SchedWriteMoveLSWidths Sched, -                                  string EVEX2VEXOvrd, bit NoMRPattern = 0> { +                                  bit NoMRPattern = 0> {    let Predicates = [prd] in    defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore, -                        masked_store_aligned, Sched.ZMM, "", -                        NoMRPattern>, EVEX_V512; +                        masked_store_aligned, Sched.ZMM, NoMRPattern>, EVEX_V512;    let Predicates = [prd, HasVLX] in {      defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore, -                             masked_store_aligned, Sched.YMM, -                             EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; +                             masked_store_aligned, Sched.YMM, NoMRPattern>, EVEX_V256;      defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore, -                             masked_store_aligned, Sched.XMM, EVEX2VEXOvrd, -                             NoMRPattern>, EVEX_V128; +                             masked_store_aligned, Sched.XMM, NoMRPattern>, EVEX_V128;    }  }  defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, -                                     HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, +                                     HasAVX512, SchedWriteFMoveLS>,                 avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, -                                      HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, +                                      HasAVX512, SchedWriteFMoveLS>,                 TB, EVEX_CD8<32, CD8VF>;  defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, -                                     HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, +                                     HasAVX512, SchedWriteFMoveLS>,                 avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, -                                      HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, +                                      HasAVX512, SchedWriteFMoveLS>,                 TB, PD, REX_W, EVEX_CD8<64, CD8VF>;  defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, -                              SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, +                              SchedWriteFMoveLS, 0, null_frag>,                 avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, -                               SchedWriteFMoveLS, "VMOVUPS">, +                               SchedWriteFMoveLS>,                                 TB, EVEX_CD8<32, CD8VF>;  defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, -                              SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, +                              SchedWriteFMoveLS, 0, null_frag>,                 avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, -                               SchedWriteFMoveLS, "VMOVUPD">, +                               SchedWriteFMoveLS>,                 TB, PD, REX_W, EVEX_CD8<64, CD8VF>;  defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, -                                       HasAVX512, SchedWriteVecMoveLS, -                                       "VMOVDQA", 1>, +                                       HasAVX512, SchedWriteVecMoveLS, 1>, 
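The hunks above stop threading hand-written EVEX2VEX override strings through the AVX-512 load/store multiclasses; the encoding choice those strings used to hint at boils down to a feature check on each instruction. A minimal C++ sketch of that rule, assuming an invented InstDesc summary (not LLVM's real MCInstrDesc or any API from this patch):

// Minimal sketch only; InstDesc and canCompressToVex are invented for
// illustration and are not LLVM APIs.
#include <iostream>

struct InstDesc {
  bool UsesZmm;        // 512-bit operands are EVEX-only
  bool UsesHighRegs;   // xmm16-xmm31 / ymm16-ymm31 need EVEX
  bool UsesMasking;    // {k}/{z} masking needs EVEX
  bool UsesBroadcast;  // embedded broadcast needs EVEX
};

// An EVEX instruction can be re-encoded with the shorter VEX prefix only
// when it uses none of the EVEX-only features.
bool canCompressToVex(const InstDesc &D) {
  return !D.UsesZmm && !D.UsesHighRegs && !D.UsesMasking && !D.UsesBroadcast;
}

int main() {
  InstDesc Vmovups128{false, false, false, false};
  InstDesc Vmovups512{true, false, false, false};
  std::cout << canCompressToVex(Vmovups128) << ' '   // 1: a VEX form works
            << canCompressToVex(Vmovups512) << '\n'; // 0: must stay EVEX
}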
                  avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, -                                        HasAVX512, SchedWriteVecMoveLS, -                                        "VMOVDQA", 1>, +                                        HasAVX512, SchedWriteVecMoveLS, 1>,                   TB, PD, EVEX_CD8<32, CD8VF>;  defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, -                                       HasAVX512, SchedWriteVecMoveLS, -                                       "VMOVDQA">, +                                       HasAVX512, SchedWriteVecMoveLS>,                   avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, -                                        HasAVX512, SchedWriteVecMoveLS, -                                        "VMOVDQA">, +                                        HasAVX512, SchedWriteVecMoveLS>,                   TB, PD, REX_W, EVEX_CD8<64, CD8VF>;  defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, -                               SchedWriteVecMoveLS, "VMOVDQU", 1>, +                               SchedWriteVecMoveLS, 1>,                  avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, -                                SchedWriteVecMoveLS, "VMOVDQU", 1>, +                                SchedWriteVecMoveLS, 1>,                  TB, XD, EVEX_CD8<8, CD8VF>;  defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, -                                SchedWriteVecMoveLS, "VMOVDQU", 1>, +                                SchedWriteVecMoveLS, 1>,                   avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, -                                 SchedWriteVecMoveLS, "VMOVDQU", 1>, +                                 SchedWriteVecMoveLS, 1>,                   TB, XD, REX_W, EVEX_CD8<16, CD8VF>;  defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, -                                SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>, +                                SchedWriteVecMoveLS, 1, null_frag>,                   avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, -                                 SchedWriteVecMoveLS, "VMOVDQU", 1>, +                                 SchedWriteVecMoveLS, 1>,                   TB, XS, EVEX_CD8<32, CD8VF>;  defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, -                                SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>, +                                SchedWriteVecMoveLS, 0, null_frag>,                   avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, -                                 SchedWriteVecMoveLS, "VMOVDQU">, +                                 SchedWriteVecMoveLS>,                   TB, XS, REX_W, EVEX_CD8<64, CD8VF>;  // Special instructions to help with spilling when we don't have VLX. 
We need @@ -4844,8 +4825,7 @@ defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,  defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,                                      SchedWriteVecIMul, HasBWI, 1>;  defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, -                                    SchedWriteVecIMul, HasDQI, 1>, T8, -                                    NotEVEX2VEXConvertible; +                                    SchedWriteVecIMul, HasDQI, 1>, T8;  defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul,                                      HasBWI, 1>;  defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, @@ -4989,8 +4969,7 @@ defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax,  defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax,                                      SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax, -                                    SchedWriteVecALU, HasAVX512, 1>, T8, -                                    NotEVEX2VEXConvertible; +                                    SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax,                                      SchedWriteVecALU, HasBWI, 1>; @@ -4999,8 +4978,7 @@ defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax,  defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax,                                      SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax, -                                    SchedWriteVecALU, HasAVX512, 1>, T8, -                                    NotEVEX2VEXConvertible; +                                    SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin,                                      SchedWriteVecALU, HasBWI, 1>, T8; @@ -5009,8 +4987,7 @@ defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin,  defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin,                                      SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin, -                                    SchedWriteVecALU, HasAVX512, 1>, T8, -                                    NotEVEX2VEXConvertible; +                                    SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin,                                      SchedWriteVecALU, HasBWI, 1>; @@ -5019,8 +4996,7 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin,  defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,                                      SchedWriteVecALU, HasAVX512, 1>, T8;  defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, -                                    SchedWriteVecALU, HasAVX512, 1>, T8, -                                    NotEVEX2VEXConvertible; +                                    SchedWriteVecALU, HasAVX512, 1>, T8;  // PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.  
let Predicates = [HasDQI, NoVLX] in { @@ -5405,8 +5381,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo  }  multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,                                  SDNode OpNode, SDNode VecNode, SDNode SaeNode, -                                X86FoldableSchedWrite sched, bit IsCommutable, -                                string EVEX2VexOvrd> { +                                X86FoldableSchedWrite sched, bit IsCommutable> {    let ExeDomain = _.ExeDomain in {    defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),                             (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -5427,8 +5402,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,                           (ins _.FRC:$src1, _.FRC:$src2),                            OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",                            [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, -                          Sched<[sched]>, -                          EVEX2VEXOverride<EVEX2VexOvrd#"rr"> { +                          Sched<[sched]> {      let isCommutable = IsCommutable;    }    def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5436,8 +5410,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,                           OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",                           [(set _.FRC:$dst, (OpNode _.FRC:$src1,                           (_.ScalarLdFrag addr:$src2)))]>, -                         Sched<[sched.Folded, sched.ReadAfterFold]>, -                         EVEX2VEXOverride<EVEX2VexOvrd#"rm">; +                         Sched<[sched.Folded, sched.ReadAfterFold]>;    }    let Uses = [MXCSR] in @@ -5474,19 +5447,15 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,                                SDNode VecNode, SDNode SaeNode,                                X86SchedWriteSizes sched, bit IsCommutable> {    defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, -                              VecNode, SaeNode, sched.PS.Scl, IsCommutable, -                              NAME#"SS">, +                              VecNode, SaeNode, sched.PS.Scl, IsCommutable>,                                TB, XS, EVEX, VVVV, VEX_LIG,  EVEX_CD8<32, CD8VT1>;    defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, -                              VecNode, SaeNode, sched.PD.Scl, IsCommutable, -                              NAME#"SD">, +                              VecNode, SaeNode, sched.PD.Scl, IsCommutable>,                                TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>;    let Predicates = [HasFP16] in {      defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode, -                                VecNode, SaeNode, sched.PH.Scl, IsCommutable, -                                NAME#"SH">, -                                T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, -                                NotEVEX2VEXConvertible; +                                VecNode, SaeNode, sched.PH.Scl, IsCommutable>, +                                T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>;    }  }  defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, @@ -5506,14 +5475,13 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs,  // X86fminc and X86fmaxc instead of 
X86fmin and X86fmax  multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,                                      X86VectorVTInfo _, SDNode OpNode, -                                    X86FoldableSchedWrite sched, -                                    string EVEX2VEXOvrd> { +                                    X86FoldableSchedWrite sched> {    let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {    def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),                           (ins _.FRC:$src1, _.FRC:$src2),                            OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",                            [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, -                          Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> { +                          Sched<[sched]> {      let isCommutable = 1;    }    def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5521,36 +5489,34 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,                           OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",                           [(set _.FRC:$dst, (OpNode _.FRC:$src1,                           (_.ScalarLdFrag addr:$src2)))]>, -                         Sched<[sched.Folded, sched.ReadAfterFold]>, -                         EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; +                         Sched<[sched.Folded, sched.ReadAfterFold]>;    }  }  defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, -                                         SchedWriteFCmp.Scl, "VMINCSS">, TB, XS, +                                         SchedWriteFCmp.Scl>, TB, XS,                                           EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;  defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, -                                         SchedWriteFCmp.Scl, "VMINCSD">, TB, XD, +                                         SchedWriteFCmp.Scl>, TB, XD,                                           REX_W, EVEX, VVVV, VEX_LIG,                                           EVEX_CD8<64, CD8VT1>, SIMD_EXC;  defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, -                                         SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS, +                                         SchedWriteFCmp.Scl>, TB, XS,                                           EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;  defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, -                                         SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD, +                                         SchedWriteFCmp.Scl>, TB, XD,                                           REX_W, EVEX, VVVV, VEX_LIG,                                           EVEX_CD8<64, CD8VT1>, SIMD_EXC;  defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, -                                         SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS, -                                         EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, -                                         NotEVEX2VEXConvertible; +                                         SchedWriteFCmp.Scl>, T_MAP5, XS, +                                         EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC; +  defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, -                                         SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS, -                                         EVEX, VVVV, VEX_LIG, 
EVEX_CD8<16, CD8VT1>, SIMD_EXC, -                                         NotEVEX2VEXConvertible; +                                         SchedWriteFCmp.Scl>, T_MAP5, XS, +                                         EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC;  multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,                              SDPatternOperator MaskOpNode, @@ -5820,8 +5786,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr                                     EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD;    }  } -defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", -                                    SchedWriteFAdd>, NotEVEX2VEXConvertible; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", SchedWriteFAdd>;  //===----------------------------------------------------------------------===//  // AVX-512  VPTESTM instructions @@ -5985,11 +5950,9 @@ multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,  multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,                                string OpcodeStr, SDNode OpNode, -                              X86SchedWriteWidths sched, -                              bit NotEVEX2VEXConvertibleQ = 0> { +                              X86SchedWriteWidths sched> {    defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32,                                avx512vl_i32_info, HasAVX512>; -  let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in    defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64,                                avx512vl_i64_info, HasAVX512>, REX_W;    defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16, @@ -6034,11 +5997,9 @@ multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM,  multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,                                 Format ImmFormR, Format ImmFormM,                                 string OpcodeStr, SDNode OpNode, -                               X86SchedWriteWidths sched, -                               bit NotEVEX2VEXConvertibleQ = 0> { +                               X86SchedWriteWidths sched> {    defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,                                   sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -  let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in    defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,                                   sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, REX_W;  } @@ -6054,7 +6015,7 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,                                  SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV;  defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, -                                 SchedWriteVecShiftImm, 1>, +                                 SchedWriteVecShiftImm>,               avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,                                  SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; @@ -6066,7 +6027,7 @@ defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,  defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl,                                  SchedWriteVecShift>;  defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, -                                SchedWriteVecShift, 1>; +                  
              SchedWriteVecShift>;  defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,                                  SchedWriteVecShift>; @@ -6435,7 +6396,7 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,                                 avx512vl_i32_info>;  let ExeDomain = SSEPackedDouble in  defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, -                               avx512vl_i64_info>, VEX_W1X; +                               avx512vl_i64_info>, REX_W;  //===----------------------------------------------------------------------===//  // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW @@ -8443,9 +8404,9 @@ multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo    }    let Predicates = [HasDQI, HasVLX] in {      defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode, -                               MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible; +                               MaskOpNode, sched.XMM>, EVEX_V128;      defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode, -                               MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible; +                               MaskOpNode, sched.YMM>, EVEX_V256;    }  } @@ -8524,11 +8485,10 @@ multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperato      defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info128, null_frag,                                 null_frag, sched.XMM, _src.info128.BroadcastStr,                                 "{x}", i128mem, _src.info128.KRCWM>, -                               EVEX_V128, NotEVEX2VEXConvertible; +                               EVEX_V128;      defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info256, OpNode,                                 MaskOpNode, sched.YMM, _src.info256.BroadcastStr, -                               "{y}">, EVEX_V256, -                               NotEVEX2VEXConvertible; +                               "{y}">, EVEX_V256;      // Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction      // patterns have been disabled with null_frag. 
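For orientation, the VCVTQQ2PD/VCVTQQ2PS VL patterns touched in the hunks above correspond to the packed int64-to-float conversions exposed through the standard Intel intrinsics. A short, hedged illustration of the plain and merge-masked forms (not code from this patch; building it requires AVX512DQ and AVX512VL, e.g. -mavx512dq -mavx512vl):

// Illustration only: standard Intel intrinsics, not code from this patch.
#include <immintrin.h>

__m256d cvt_i64_to_f64(__m256i V) {
  return _mm256_cvtepi64_pd(V);              // vcvtqq2pd ymm, ymm
}

__m256d cvt_i64_to_f64_masked(__m256d Src, __mmask8 K, __m256i V) {
  return _mm256_mask_cvtepi64_pd(Src, K, V); // merge-masked form
}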
@@ -10882,8 +10842,7 @@ defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info,  multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,                                            X86FoldableSchedWrite sched,                                            X86VectorVTInfo _, -                                          X86VectorVTInfo CastInfo, -                                          string EVEX2VEXOvrd> { +                                          X86VectorVTInfo CastInfo> {    let ExeDomain = _.ExeDomain in {    defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),                    (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), @@ -10891,7 +10850,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,                    (_.VT (bitconvert                           (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2,                                                    (i8 timm:$src3)))))>, -                  Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; +                  Sched<[sched]>;    defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),                  (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),                  OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10900,8 +10859,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,                    (CastInfo.VT (X86Shuf128 _.RC:$src1,                                             (CastInfo.LdFrag addr:$src2),                                             (i8 timm:$src3)))))>, -                Sched<[sched.Folded, sched.ReadAfterFold]>, -                EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; +                Sched<[sched.Folded, sched.ReadAfterFold]>;    defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),                      (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),                      OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", @@ -10918,45 +10876,40 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,  multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched,                                     AVX512VLVectorVTInfo _, -                                   AVX512VLVectorVTInfo CastInfo, bits<8> opc, -                                   string EVEX2VEXOvrd>{ +                                   AVX512VLVectorVTInfo CastInfo, bits<8> opc>{    let Predicates = [HasAVX512] in    defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, -                                          _.info512, CastInfo.info512, "">, EVEX_V512; +                                          _.info512, CastInfo.info512>, EVEX_V512;    let Predicates = [HasAVX512, HasVLX] in    defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, -                                             _.info256, CastInfo.info256, -                                             EVEX2VEXOvrd>, EVEX_V256; +                                             _.info256, CastInfo.info256>, EVEX_V256;  }  defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256, -      avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; +      avx512vl_f32_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;  defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256, -      avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; +      avx512vl_f64_info, 
avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;  defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, -      avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; +      avx512vl_i32_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>;  defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, -      avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; +      avx512vl_i64_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W;  multiclass avx512_valign<bits<8> opc, string OpcodeStr,                           X86FoldableSchedWrite sched, X86VectorVTInfo _>{ -  // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the -  // instantiation of this class.    let ExeDomain = _.ExeDomain in {    defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),                    (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),                    OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",                    (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, -                  Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; +                  Sched<[sched]>;    defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),                  (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),                  OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",                  (_.VT (X86VAlign _.RC:$src1,                                   (bitconvert (_.LdFrag addr:$src2)),                                   (i8 timm:$src3)))>, -                Sched<[sched.Folded, sched.ReadAfterFold]>, -                EVEX2VEXOverride<"VPALIGNRrmi">; +                Sched<[sched.Folded, sched.ReadAfterFold]>;    defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),                     (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), @@ -10979,7 +10932,6 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched,      defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>,                                  AVX512AIi8Base, EVEX, VVVV, EVEX_V128;      // We can't really override the 256-bit version so change it back to unset. -    let EVEX2VEXOverride = ? 
in      defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>,                                  AVX512AIi8Base, EVEX, VVVV, EVEX_V256;    } @@ -11111,7 +11063,7 @@ let Predicates = [HasVLX, HasBWI] in {  defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",                  SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>, -                EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible; +                EVEX_CD8<8, CD8VF>;  multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,                             X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -13088,12 +13040,10 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo    let Predicates = [HasFP16, HasVLX] in {      defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v2i64x_info,                                 null_frag, null_frag, sched.XMM, "{1to2}", "{x}", -                               i128mem, VK2WM>, -                               EVEX_V128, NotEVEX2VEXConvertible; +                               i128mem, VK2WM>, EVEX_V128;      defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v4i64x_info,                                 null_frag, null_frag, sched.YMM, "{1to4}", "{y}", -                               i256mem, VK4WM>, -                               EVEX_V256, NotEVEX2VEXConvertible; +                               i256mem, VK4WM>, EVEX_V256;    }    def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td index 6b0c1b8c28c9..5cfa95e085e3 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -71,24 +71,60 @@ multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOpera    // FIXME: Used for 8-bit mul, ignore result upper 8 bits.    // This probably ought to be moved to a def : Pat<> if the    // syntax can be accepted. 
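The FIXME above concerns the 8-bit multiply pattern: MUL r/m8 multiplies by AL and writes the full 16-bit product to AX, while the pattern only models the AL half. A plain C++ sketch of that widening semantics (illustration only, not LLVM code):

// Illustration only: the architectural result of MUL r/m8 is AX = AL * src.
#include <cassert>
#include <cstdint>

uint16_t mul8(uint8_t AL, uint8_t Src) {
  return static_cast<uint16_t>(AL) * Src;    // full 16-bit product ("AX")
}

int main() {
  assert(mul8(200, 3) == 600);               // AL = 0x58, AH = 0x02
  return 0;
}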
-  let Defs = [AL,EFLAGS,AX], Uses = [AL] in -  def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, -                  [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>; -  let Defs = [AX,DX,EFLAGS], Uses = [AX] in -  def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; -  let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -  def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; -  let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -  def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; -  let Defs = [AL,EFLAGS,AX], Uses = [AL] in -  def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, -                  [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>; -  let Defs = [AX,DX,EFLAGS], Uses = [AX] in -  def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; -  let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -  def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; -  let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -  def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; +  let Defs = [AL, EFLAGS, AX], Uses = [AL] in +    def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, +                       [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>; +  let Defs = [AX, DX, EFLAGS], Uses = [AX] in +    def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; +  let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in +    def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; +  let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in +    def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; +  let Defs = [AL, EFLAGS, AX], Uses = [AL] in +    def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, +                       [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>; +  let Defs = [AX, DX, EFLAGS], Uses = [AX] in +    def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; +  let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in +    def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; +  let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in +    def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; + +  let Predicates = [In64BitMode] in { +    let Defs = [AL, AX], Uses = [AL] in +      def 8r_NF : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, NF; +    let Defs = [AX, DX], Uses = [AX] in +      def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, NF, PD; +    let Defs = [EAX, EDX], Uses = [EAX] in +      def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, NF; +    let Defs = [RAX, RDX], Uses = [RAX] in +      def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, NF; +    let Defs = [AL, AX], Uses = [AL] in +      def 8m_NF : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, NF; +    let Defs = [AX, DX], Uses = [AX] in +      def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, NF, PD; +    let Defs = [EAX, EDX], Uses = [EAX] in +      def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, NF; +    let Defs = [RAX, RDX], Uses = [RAX] in +      def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, NF; + +    let Defs = [AL, EFLAGS, AX], Uses = [AL] in +      def 8r_EVEX : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, PL; +    let Defs = [AX, DX, EFLAGS], Uses = [AX] in +      def 16r_EVEX : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, PL, PD; +    let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in +      def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, PL; +    let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] 
in +      def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, PL; +    let Defs = [AL, EFLAGS, AX], Uses = [AL] in +      def 8m_EVEX : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, PL; +    let Defs = [AX, DX, EFLAGS], Uses = [AX] in +      def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, PL, PD; +    let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in +      def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, PL; +    let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in +      def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, PL; +  }  }  defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>; @@ -99,137 +135,341 @@ multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> {    defvar sched16 = !if(!eq(m, "div"), WriteDiv16, WriteIDiv16);    defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32);    defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64); -  let Defs = [AL,AH,EFLAGS], Uses = [AX] in -  def 8r  : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>; -  let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -  def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; -  let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in -  def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; -  let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in -  def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>; -  let Defs = [AL,AH,EFLAGS], Uses = [AX] in -  def 8m  : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>; -  let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -  def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; -  let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in -  def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; -  let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in -  def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; +  let Defs = [AL, AH, EFLAGS], Uses = [AX] in +    def 8r  : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>; +  let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in +    def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; +  let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in +    def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; +  let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in +    def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>; +  let Defs = [AL, AH, EFLAGS], Uses = [AX] in +    def 8m  : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>; +  let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in +    def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; +  let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in +    def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; +  let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in +    def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; + +  let Predicates = [In64BitMode] in { +    let Defs = [AL, AH], Uses = [AX] in +      def 8r_NF  : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, NF; +    let Defs = [AX, DX], Uses = [AX, DX] in +      def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, NF, PD; +    let Defs = [EAX, EDX], Uses = [EAX, EDX] in +      def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, NF; +    let Defs = [RAX, RDX], Uses = [RAX, RDX] in +      def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, NF; +    let Defs = [AL, AH], Uses = [AX] in +      def 8m_NF  : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, NF; +    let Defs = [AX, DX], Uses = [AX, DX] in +      def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, NF, PD; +    let Defs = [EAX, EDX], Uses = [EAX, EDX] in +   
   def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, NF; +    let Defs = [RAX, RDX], Uses = [RAX, RDX] in +      def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, NF; + +    let Defs = [AL, AH, EFLAGS], Uses = [AX] in +      def 8r_EVEX  : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, PL; +    let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in +      def 16r_EVEX : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, PL, PD; +    let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in +      def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, PL; +    let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in +      def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, PL; +    let Defs = [AL, AH, EFLAGS], Uses = [AX] in +      def 8m_EVEX  : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, PL; +    let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in +      def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, PL, PD; +    let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in +      def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, PL; +    let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in +      def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, PL; +  }  } +  let hasSideEffects = 1 in { // so that we don't speculatively execute -defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; -defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>; +  defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; +  defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>;  } -class IMulOpRR<X86TypeInfo t, X86FoldableSchedWrite sched> -  : BinOpRR_RF<0xAF, "imul", t, X86smul_flag>, TB { +class IMulOpRR_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> +  : BinOpRR_R<0xAF, "imul", t, ndd> {    let Form = MRMSrcReg;    let SchedRW = [sched];    // X = IMUL Y, Z --> X = IMUL Z, Y    let isCommutable = 1;  } -class IMulOpRM<X86TypeInfo t, X86FoldableSchedWrite sched> -  : BinOpRM_RF<0xAF, "imul", t, X86smul_flag>, TB { -let Form = MRMSrcMem; -let SchedRW = [sched.Folded, sched.ReadAfterFold]; +class IMulOpRR_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> +  : BinOpRR_RF<0xAF, "imul", t, X86smul_flag, ndd> { +  let Form = MRMSrcReg; +  let SchedRW = [sched]; +  // X = IMUL Y, Z --> X = IMUL Z, Y +  let isCommutable = 1; +} +class IMulOpRM_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> +  : BinOpRM_R<0xAF, "imul", t, ndd> { +  let Form = MRMSrcMem; +  let SchedRW = [sched.Folded, sched.ReadAfterFold]; +} +class IMulOpRM_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> +  : BinOpRM_RF<0xAF, "imul", t, X86smul_flag, ndd> { +  let Form = MRMSrcMem; +  let SchedRW = [sched.Folded, sched.ReadAfterFold]; +} + +let Predicates = [NoNDD] in { +  def IMUL16rr : IMulOpRR_RF<Xi16, WriteIMul16Reg>, TB, OpSize16; +  def IMUL32rr : IMulOpRR_RF<Xi32, WriteIMul32Reg>, TB, OpSize32; +  def IMUL64rr : IMulOpRR_RF<Xi64, WriteIMul64Reg>, TB; +  def IMUL16rm : IMulOpRM_RF<Xi16, WriteIMul16Reg>, TB, OpSize16; +  def IMUL32rm : IMulOpRM_RF<Xi32, WriteIMul32Reg>, TB, OpSize32; +  def IMUL64rm : IMulOpRM_RF<Xi64, WriteIMul64Reg>, TB; +} +let Predicates = [HasNDD, In64BitMode] in { +  def IMUL16rr_ND : IMulOpRR_RF<Xi16, WriteIMul16Reg, 1>, PD; +  def IMUL32rr_ND : IMulOpRR_RF<Xi32, WriteIMul32Reg, 1>; +  def IMUL64rr_ND : IMulOpRR_RF<Xi64, WriteIMul64Reg, 1>; +  def IMUL16rm_ND : IMulOpRM_RF<Xi16, WriteIMul16Reg, 1>, PD; +  def IMUL32rm_ND : IMulOpRM_RF<Xi32, WriteIMul32Reg, 1>; +  def IMUL64rm_ND : IMulOpRM_RF<Xi64, WriteIMul64Reg, 1>;  } -def IMUL16rr : IMulOpRR<Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rr : IMulOpRR<Xi32, 
WriteIMul32Reg>, OpSize32; -def IMUL64rr : IMulOpRR<Xi64, WriteIMul64Reg>; -def IMUL16rm : IMulOpRM<Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rm : IMulOpRM<Xi32, WriteIMul32Reg>, OpSize32; -def IMUL64rm : IMulOpRM<Xi64, WriteIMul64Reg>; +let Predicates = [In64BitMode], Pattern = [(null_frag)] in { +  def IMUL16rr_NF : IMulOpRR_R<Xi16, WriteIMul16Reg>, NF, PD; +  def IMUL32rr_NF : IMulOpRR_R<Xi32, WriteIMul32Reg>, NF; +  def IMUL64rr_NF : IMulOpRR_R<Xi64, WriteIMul64Reg>, NF; +  def IMUL16rm_NF : IMulOpRM_R<Xi16, WriteIMul16Reg>, NF, PD; +  def IMUL32rm_NF : IMulOpRM_R<Xi32, WriteIMul32Reg>, NF; +  def IMUL64rm_NF : IMulOpRM_R<Xi64, WriteIMul64Reg>, NF; + +  def IMUL16rr_NF_ND : IMulOpRR_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD; +  def IMUL32rr_NF_ND : IMulOpRR_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF; +  def IMUL64rr_NF_ND : IMulOpRR_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF; +  def IMUL16rm_NF_ND : IMulOpRM_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD; +  def IMUL32rm_NF_ND : IMulOpRM_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF; +  def IMUL64rm_NF_ND : IMulOpRM_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF; + +  def IMUL16rr_EVEX : IMulOpRR_RF<Xi16, WriteIMul16Reg>, PL, PD; +  def IMUL32rr_EVEX : IMulOpRR_RF<Xi32, WriteIMul32Reg>, PL; +  def IMUL64rr_EVEX : IMulOpRR_RF<Xi64, WriteIMul64Reg>, PL; +  def IMUL16rm_EVEX : IMulOpRM_RF<Xi16, WriteIMul16Reg>, PL, PD; +  def IMUL32rm_EVEX : IMulOpRM_RF<Xi32, WriteIMul32Reg>, PL; +  def IMUL64rm_EVEX : IMulOpRM_RF<Xi64, WriteIMul64Reg>, PL; +}  class IMulOpRI8_R<X86TypeInfo t, X86FoldableSchedWrite sched>    : BinOpRI8<0x6B, "imul", binop_ndd_args, t, MRMSrcReg, -             (outs t.RegClass:$dst)>, DefEFLAGS { +             (outs t.RegClass:$dst)> {    let SchedRW = [sched];  }  class IMulOpRI_R<X86TypeInfo t, X86FoldableSchedWrite sched>    : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg, +            (outs t.RegClass:$dst), []> { +  let SchedRW = [sched]; +} +class IMulOpRI_RF<X86TypeInfo t, X86FoldableSchedWrite sched> +  : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg,              (outs t.RegClass:$dst),              [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1,               t.ImmNoSuOperator:$src2))]>, DefEFLAGS {    let SchedRW = [sched];  }  class IMulOpMI8_R<X86TypeInfo t, X86FoldableSchedWrite sched> -  : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)>, -    DefEFLAGS { +  : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)> {    let Opcode = 0x6B;    let SchedRW = [sched.Folded];  }  class IMulOpMI_R<X86TypeInfo t, X86FoldableSchedWrite sched>    : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem, +            (outs t.RegClass:$dst), []> { +  let SchedRW = [sched.Folded]; +} +class IMulOpMI_RF<X86TypeInfo t, X86FoldableSchedWrite sched> +  : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem,              (outs t.RegClass:$dst),              [(set t.RegClass:$dst, EFLAGS, (X86smul_flag (t.LoadNode addr:$src1),               t.ImmNoSuOperator:$src2))]>,      DefEFLAGS {    let SchedRW = [sched.Folded];  } -def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>; -def IMUL16rri  : IMulOpRI_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rri  : IMulOpRI_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rri32 : IMulOpRI_R<Xi64, WriteIMul64Imm>; - -def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rmi8 
: IMulOpMI8_R<Xi64, WriteIMul64Imm>; -def IMUL16rmi  : IMulOpMI_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rmi  : IMulOpMI_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>; - +def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16; +def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32; +def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS; +def IMUL16rri  : IMulOpRI_RF<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rri  : IMulOpRI_RF<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rri32 : IMulOpRI_RF<Xi64, WriteIMul64Imm>; +def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16; +def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32; +def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS; +def IMUL16rmi  : IMulOpMI_RF<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rmi  : IMulOpMI_RF<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rmi32 : IMulOpMI_RF<Xi64, WriteIMul64Imm>; + +let Predicates = [In64BitMode] in { +  def IMUL16rri8_NF : IMulOpRI8_R<Xi16, WriteIMul16Imm>, NF, PD; +  def IMUL32rri8_NF : IMulOpRI8_R<Xi32, WriteIMul32Imm>, NF; +  def IMUL64rri8_NF : IMulOpRI8_R<Xi64, WriteIMul64Imm>, NF; +  def IMUL16rri_NF  : IMulOpRI_R<Xi16, WriteIMul16Imm>, NF, PD; +  def IMUL32rri_NF  : IMulOpRI_R<Xi32, WriteIMul32Imm>, NF; +  def IMUL64rri32_NF : IMulOpRI_R<Xi64, WriteIMul64Imm>, NF; +  def IMUL16rmi8_NF : IMulOpMI8_R<Xi16, WriteIMul16Imm>, NF, PD; +  def IMUL32rmi8_NF : IMulOpMI8_R<Xi32, WriteIMul32Imm>, NF; +  def IMUL64rmi8_NF : IMulOpMI8_R<Xi64, WriteIMul64Imm>, NF; +  def IMUL16rmi_NF  : IMulOpMI_R<Xi16, WriteIMul16Imm>, NF, PD; +  def IMUL32rmi_NF  : IMulOpMI_R<Xi32, WriteIMul32Imm>, NF; +  def IMUL64rmi32_NF : IMulOpMI_R<Xi64, WriteIMul64Imm>, NF; + +  def IMUL16rri8_EVEX : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD; +  def IMUL32rri8_EVEX : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL; +  def IMUL64rri8_EVEX : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL; +  def IMUL16rri_EVEX  : IMulOpRI_RF<Xi16, WriteIMul16Imm>, PL, PD; +  def IMUL32rri_EVEX  : IMulOpRI_RF<Xi32, WriteIMul32Imm>, PL; +  def IMUL64rri32_EVEX : IMulOpRI_RF<Xi64, WriteIMul64Imm>, PL; +  def IMUL16rmi8_EVEX : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD; +  def IMUL32rmi8_EVEX : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL; +  def IMUL64rmi8_EVEX : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL; +  def IMUL16rmi_EVEX  : IMulOpMI_RF<Xi16, WriteIMul16Imm>, PL, PD; +  def IMUL32rmi_EVEX  : IMulOpMI_RF<Xi32, WriteIMul32Imm>, PL; +  def IMUL64rmi32_EVEX : IMulOpMI_RF<Xi64, WriteIMul64Imm>, PL; +}  //===----------------------------------------------------------------------===//  // INC and DEC Instructions  // -class IncOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> { +class IncOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag, ndd> {    let Pattern = [(set t.RegClass:$dst, EFLAGS,                   (X86add_flag_nocf t.RegClass:$src1, 1))];  } -class DecOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> { +class DecOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag, ndd> {    let Pattern = [(set t.RegClass:$dst, EFLAGS,                   (X86sub_flag_nocf t.RegClass:$src1, 1))];  } -class IncOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { +class IncOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM0r, "inc", t, null_frag, ndd>; +class 
DecOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM1r, "dec", t, null_frag, ndd>; +class IncOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> {    let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1),                   (implicit EFLAGS)];  } -class DecOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { +class DecOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> {    let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1),                   (implicit EFLAGS)];  } +class IncOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM0m, "inc", t, null_frag> { +  let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), 1))]; +} +class DecOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM1m, "dec", t, null_frag> { +  let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), -1))]; +} +class IncOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM1m, "dec", t, null_frag>; +class IncOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM1m, "dec", t, null_frag>; +  // IncDec_Alt - Instructions like "inc reg" short forms.  // Short forms only valid in 32-bit mode. Selected during MCInst lowering.  class IncDec_Alt<bits<8> o, string m, X86TypeInfo t>    : UnaryOpR_RF<o, AddRegFrm, m, t, null_frag>, Requires<[Not64BitMode]>;  let isConvertibleToThreeAddress = 1 in { -def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; -def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; -def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; -def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; -def INC8r  : IncOpR_RF<Xi8>; -def INC16r : IncOpR_RF<Xi16>, OpSize16; -def INC32r : IncOpR_RF<Xi32>, OpSize32; -def INC64r : IncOpR_RF<Xi64>; -def DEC8r  : DecOpR_RF<Xi8>; -def DEC16r : DecOpR_RF<Xi16>, OpSize16; -def DEC32r : DecOpR_RF<Xi32>, OpSize32; -def DEC64r : DecOpR_RF<Xi64>; +  def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; +  def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; +  def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; +  def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; +  let Predicates = [NoNDD] in { +    def INC8r  : IncOpR_RF<Xi8>; +    def INC16r : IncOpR_RF<Xi16>, OpSize16; +    def INC32r : IncOpR_RF<Xi32>, OpSize32; +    def INC64r : IncOpR_RF<Xi64>; +    def DEC8r  : DecOpR_RF<Xi8>; +    def DEC16r : DecOpR_RF<Xi16>, OpSize16; +    def DEC32r : DecOpR_RF<Xi32>, OpSize32; +    def DEC64r : DecOpR_RF<Xi64>; +  } +  let Predicates = [HasNDD, In64BitMode] in { +    def INC8r_ND  : IncOpR_RF<Xi8, 1>; +    def INC16r_ND : IncOpR_RF<Xi16, 1>, PD; +    def INC32r_ND : IncOpR_RF<Xi32, 1>; +    def INC64r_ND : IncOpR_RF<Xi64, 1>; +    def DEC8r_ND  : DecOpR_RF<Xi8, 1>; +    def DEC16r_ND : DecOpR_RF<Xi16, 1>, PD; +    def DEC32r_ND : DecOpR_RF<Xi32, 1>; +    def DEC64r_ND : DecOpR_RF<Xi64, 1>; +  } +  let Predicates = [In64BitMode], Pattern = [(null_frag)] in { +    def INC8r_NF  : IncOpR_R<Xi8>, NF; +    def INC16r_NF : IncOpR_R<Xi16>, NF, PD; +    def INC32r_NF : IncOpR_R<Xi32>, NF; +    def INC64r_NF : IncOpR_R<Xi64>, NF; +    def DEC8r_NF  : DecOpR_R<Xi8>, NF; +    def DEC16r_NF : DecOpR_R<Xi16>, NF, PD; +    def DEC32r_NF : DecOpR_R<Xi32>, NF; +    def DEC64r_NF : DecOpR_R<Xi64>, NF; +    def INC8r_NF_ND  : IncOpR_R<Xi8, 1>, NF; +    def INC16r_NF_ND : IncOpR_R<Xi16, 1>, NF, PD; +    def INC32r_NF_ND : 
IncOpR_R<Xi32, 1>, NF; +    def INC64r_NF_ND : IncOpR_R<Xi64, 1>, NF; +    def DEC8r_NF_ND  : DecOpR_R<Xi8, 1>, NF; +    def DEC16r_NF_ND : DecOpR_R<Xi16, 1>, NF, PD; +    def DEC32r_NF_ND : DecOpR_R<Xi32, 1>, NF; +    def DEC64r_NF_ND : DecOpR_R<Xi64, 1>, NF; +    def INC8r_EVEX  : IncOpR_RF<Xi8>, PL; +    def INC16r_EVEX : IncOpR_RF<Xi16>, PL, PD; +    def INC32r_EVEX : IncOpR_RF<Xi32>, PL; +    def INC64r_EVEX : IncOpR_RF<Xi64>, PL; +    def DEC8r_EVEX  : DecOpR_RF<Xi8>, PL; +    def DEC16r_EVEX : DecOpR_RF<Xi16>, PL, PD; +    def DEC32r_EVEX : DecOpR_RF<Xi32>, PL; +    def DEC64r_EVEX : DecOpR_RF<Xi64>, PL; +  }  }  let Predicates = [UseIncDec] in { -def INC8m  : IncOpM_M<Xi8>; -def INC16m : IncOpM_M<Xi16>, OpSize16; -def INC32m : IncOpM_M<Xi32>, OpSize32; -def DEC8m  : DecOpM_M<Xi8>; -def DEC16m : DecOpM_M<Xi16>, OpSize16; -def DEC32m : DecOpM_M<Xi32>, OpSize32; +  def INC8m  : IncOpM_MF<Xi8>; +  def INC16m : IncOpM_MF<Xi16>, OpSize16; +  def INC32m : IncOpM_MF<Xi32>, OpSize32; +  def DEC8m  : DecOpM_MF<Xi8>; +  def DEC16m : DecOpM_MF<Xi16>, OpSize16; +  def DEC32m : DecOpM_MF<Xi32>, OpSize32;  }  let Predicates = [UseIncDec, In64BitMode] in { -def INC64m : IncOpM_M<Xi64>; -def DEC64m : DecOpM_M<Xi64>; +  def INC64m : IncOpM_MF<Xi64>; +  def DEC64m : DecOpM_MF<Xi64>; +} +let Predicates = [HasNDD, In64BitMode, UseIncDec] in { +  def INC8m_ND  : IncOpM_RF<Xi8>; +  def INC16m_ND : IncOpM_RF<Xi16>, PD; +  def INC32m_ND : IncOpM_RF<Xi32>; +  def DEC8m_ND  : DecOpM_RF<Xi8>; +  def DEC16m_ND : DecOpM_RF<Xi16>, PD; +  def DEC32m_ND : DecOpM_RF<Xi32>; +  def INC64m_ND : IncOpM_RF<Xi64>; +  def DEC64m_ND : DecOpM_RF<Xi64>; +} +let Predicates = [In64BitMode], Pattern = [(null_frag)] in { +  def INC8m_NF  : IncOpM_M<Xi8>, NF; +  def INC16m_NF : IncOpM_M<Xi16>, NF, PD; +  def INC32m_NF : IncOpM_M<Xi32>, NF; +  def INC64m_NF : IncOpM_M<Xi64>, NF; +  def DEC8m_NF  : DecOpM_M<Xi8>, NF; +  def DEC16m_NF : DecOpM_M<Xi16>, NF, PD; +  def DEC32m_NF : DecOpM_M<Xi32>, NF; +  def DEC64m_NF : DecOpM_M<Xi64>, NF; +  def INC8m_NF_ND  : IncOpM_R<Xi8>, NF; +  def INC16m_NF_ND : IncOpM_R<Xi16>, NF, PD; +  def INC32m_NF_ND : IncOpM_R<Xi32>, NF; +  def INC64m_NF_ND : IncOpM_R<Xi64>, NF; +  def DEC8m_NF_ND  : DecOpM_R<Xi8>, NF; +  def DEC16m_NF_ND : DecOpM_R<Xi16>, NF, PD; +  def DEC32m_NF_ND : DecOpM_R<Xi32>, NF; +  def DEC64m_NF_ND : DecOpM_R<Xi64>, NF; +  def INC8m_EVEX  : IncOpM_MF<Xi8>, PL; +  def INC16m_EVEX : IncOpM_MF<Xi16>, PL, PD; +  def INC32m_EVEX : IncOpM_MF<Xi32>, PL; +  def INC64m_EVEX : IncOpM_MF<Xi64>, PL; +  def DEC8m_EVEX  : DecOpM_MF<Xi8>, PL; +  def DEC16m_EVEX : DecOpM_MF<Xi16>, PL, PD; +  def DEC32m_EVEX : DecOpM_MF<Xi32>, PL; +  def DEC64m_EVEX : DecOpM_MF<Xi64>, PL;  }  //===----------------------------------------------------------------------===// @@ -350,212 +590,212 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,    let isCommutable = CommutableRR,        isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {      let Predicates = [NoNDD] in { -      def NAME#8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; -      def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16; -      def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32; -      def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; +      def 8rr  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; +      def 16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16; +      def 32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, 
opnodeflag>, OpSize32; +      def 64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;      }      let Predicates = [HasNDD, In64BitMode] in { -      def NAME#8rr_ND  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>; -      def NAME#16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD; -      def NAME#32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>; -      def NAME#64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>; -      def NAME#8rr_NF_ND  : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF; -      def NAME#16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD; -      def NAME#32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF; -      def NAME#64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF; +      def 8rr_ND  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>; +      def 16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD; +      def 32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>; +      def 64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>; +      def 8rr_NF_ND  : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF; +      def 16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD; +      def 32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF; +      def 64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF;      }      let Predicates = [In64BitMode] in { -      def NAME#8rr_NF  : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF; -      def NAME#16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD; -      def NAME#32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF; -      def NAME#64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF; -      def NAME#8rr_EVEX  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; -      def NAME#16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; -      def NAME#32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; -      def NAME#64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; +      def 8rr_NF  : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF; +      def 16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD; +      def 32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF; +      def 64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF; +      def 8rr_EVEX  : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; +      def 16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; +      def 32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; +      def 64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;      }    } -    def NAME#8rr_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; -    def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; -    def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; -    def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; +    def 8rr_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; +    def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; +    def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; +    def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>;      let Predicates = [In64BitMode] in { -      def NAME#8rr_EVEX_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; -      def NAME#16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; -      def NAME#32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; -      def NAME#64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; -      def NAME#8rr_ND_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; -      
def NAME#16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; -      def NAME#32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; -      def NAME#64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; -      def NAME#8rr_NF_REV  : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; -      def NAME#16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; -      def NAME#32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; -      def NAME#64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF; -      def NAME#8rr_NF_ND_REV  : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; -      def NAME#16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; -      def NAME#32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; -      def NAME#64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; +      def 8rr_EVEX_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; +      def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; +      def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; +      def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; +      def 8rr_ND_REV  : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; +      def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; +      def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; +      def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; +      def 8rr_NF_REV  : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; +      def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; +      def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; +      def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF; +      def 8rr_NF_ND_REV  : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; +      def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; +      def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; +      def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;      }      let Predicates = [NoNDD] in { -      def NAME#8rm   : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; -      def NAME#16rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; -      def NAME#32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; -      def NAME#64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; +      def 8rm   : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; +      def 16rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; +      def 32rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; +      def 64rm  : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;      }      let Predicates = [HasNDD, In64BitMode] in { -      def NAME#8rm_ND  : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; -      def NAME#16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD; -      def NAME#32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; -      def NAME#64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; -      def NAME#8rm_NF_ND  : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; -      def NAME#16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; -      def NAME#32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; -      def NAME#64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; +      def 8rm_ND  : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; +      def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, 
PD; +      def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; +      def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; +      def 8rm_NF_ND  : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; +      def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; +      def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; +      def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF;      }      let Predicates = [In64BitMode] in { -      def NAME#8rm_NF  : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; -      def NAME#16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; -      def NAME#32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; -      def NAME#64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; -      def NAME#8rm_EVEX  : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; -      def NAME#16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; -      def NAME#32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; -      def NAME#64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL; +      def 8rm_NF  : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; +      def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; +      def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; +      def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; +      def 8rm_EVEX  : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; +      def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; +      def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; +      def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL;      }      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {        let Predicates = [NoNDD] in {          // NOTE: These are order specific, we want the ri8 forms to be listed          // first so that they are slightly preferred to the ri forms. 
-        def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; -        def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; -        def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; -        def NAME#8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; -        def NAME#16ri  : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; -        def NAME#32ri  : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; -        def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; +        def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; +        def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; +        def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; +        def 8ri   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; +        def 16ri  : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; +        def 32ri  : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; +        def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;        }        let Predicates = [HasNDD, In64BitMode] in { -        def NAME#16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; -        def NAME#32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; -        def NAME#64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; -        def NAME#8ri_ND   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; -        def NAME#16ri_ND  : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; -        def NAME#32ri_ND  : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; -        def NAME#64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; -        def NAME#16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; -        def NAME#32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; -        def NAME#64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; -        def NAME#8ri_NF_ND  : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; -        def NAME#16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; -        def NAME#32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; -        def NAME#64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; +        def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; +        def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; +        def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; +        def 8ri_ND   : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; +        def 16ri_ND  : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; +        def 32ri_ND  : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; +        def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; +        def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; +        def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; +        def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; +        def 8ri_NF_ND  : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; +        def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; +        def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; +        def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;        }        let Predicates = [In64BitMode] in { -        def NAME#16ri8_NF : BinOpRI8_R<0x83, 
mnemonic, Xi16, RegMRM>, NF, PD; -        def NAME#32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; -        def NAME#64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; -        def NAME#8ri_NF  : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; -        def NAME#16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; -        def NAME#32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; -        def NAME#64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; -        def NAME#16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; -        def NAME#32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; -        def NAME#64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; -        def NAME#8ri_EVEX   : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; -        def NAME#16ri_EVEX  : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; -        def NAME#32ri_EVEX  : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; -        def NAME#64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; +        def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; +        def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; +        def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; +        def 8ri_NF  : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; +        def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; +        def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; +        def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; +        def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; +        def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; +        def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; +        def 8ri_EVEX   : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; +        def 16ri_EVEX  : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; +        def 32ri_EVEX  : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; +        def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL;        }      } -    def NAME#8mr    : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; -    def NAME#16mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; -    def NAME#32mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; -    def NAME#64mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; +    def 8mr    : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; +    def 16mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; +    def 32mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; +    def 64mr   : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>;      let Predicates = [HasNDD, In64BitMode] in { -    def NAME#8mr_ND    : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>; -    def NAME#16mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; -    def NAME#32mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>; -    def NAME#64mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>; -    def NAME#8mr_NF_ND    : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF; -    def NAME#16mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD; -    def NAME#32mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF; -    def NAME#64mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF; +    def 8mr_ND    : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>; +    def 16mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; +    def 32mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>; +    def 
64mr_ND   : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>; +    def 8mr_NF_ND    : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF; +    def 16mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD; +    def 32mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF; +    def 64mr_NF_ND   : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF;    }    let Predicates = [In64BitMode] in { -    def NAME#8mr_NF    : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF; -    def NAME#16mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD; -    def NAME#32mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF; -    def NAME#64mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF; -    def NAME#8mr_EVEX    : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; -    def NAME#16mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; -    def NAME#32mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; -    def NAME#64mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; +    def 8mr_NF    : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF; +    def 16mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD; +    def 32mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF; +    def 64mr_NF   : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF; +    def 8mr_EVEX    : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; +    def 16mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; +    def 32mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; +    def 64mr_EVEX   : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;    }    // NOTE: These are order specific, we want the mi8 forms to be listed    // first so that they are slightly preferred to the mi forms. -  def NAME#16mi8  : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16; -  def NAME#32mi8  : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32; +  def 16mi8  : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16; +  def 32mi8  : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -    def NAME#64mi8  : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; -  def NAME#8mi    : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; -  def NAME#16mi   : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; -  def NAME#32mi   : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; +    def 64mi8  : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; +  def 8mi    : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; +  def 16mi   : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; +  def 32mi   : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -    def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; +    def 64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;    let Predicates = [HasNDD, In64BitMode] in { -    def NAME#16mi8_ND  : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD; -    def NAME#32mi8_ND  : BinOpMI8_RF<mnemonic, Xi32, MemMRM>; -    def NAME#64mi8_ND  : BinOpMI8_RF<mnemonic, Xi64, MemMRM>; -    def NAME#8mi_ND    : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; -    def NAME#16mi_ND   : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; -    def NAME#32mi_ND   : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; -    def NAME#64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; -    def NAME#16mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD; -    def NAME#32mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF; -    def NAME#64mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF; -    def NAME#8mi_NF_ND    : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; -    def 
NAME#16mi_NF_ND   : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; -    def NAME#32mi_NF_ND   : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; -    def NAME#64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF; +    def 16mi8_ND  : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD; +    def 32mi8_ND  : BinOpMI8_RF<mnemonic, Xi32, MemMRM>; +    def 64mi8_ND  : BinOpMI8_RF<mnemonic, Xi64, MemMRM>; +    def 8mi_ND    : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; +    def 16mi_ND   : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; +    def 32mi_ND   : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; +    def 64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; +    def 16mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD; +    def 32mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF; +    def 64mi8_NF_ND  : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF; +    def 8mi_NF_ND    : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; +    def 16mi_NF_ND   : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; +    def 32mi_NF_ND   : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; +    def 64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF;    }    let Predicates = [In64BitMode] in { -    def NAME#16mi8_NF  : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD; -    def NAME#32mi8_NF  : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF; -    def NAME#64mi8_NF  : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF; -    def NAME#8mi_NF    : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; -    def NAME#16mi_NF   : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; -    def NAME#32mi_NF   : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; -    def NAME#64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; -    def NAME#16mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD; -    def NAME#32mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL; -    def NAME#64mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL; -    def NAME#8mi_EVEX    : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; -    def NAME#16mi_EVEX   : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; -    def NAME#32mi_EVEX   : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; -    def NAME#64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL; +    def 16mi8_NF  : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD; +    def 32mi8_NF  : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF; +    def 64mi8_NF  : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF; +    def 8mi_NF    : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; +    def 16mi_NF   : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; +    def 32mi_NF   : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; +    def 64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; +    def 16mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD; +    def 32mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL; +    def 64mi8_EVEX  : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL; +    def 8mi_EVEX    : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; +    def 16mi_EVEX   : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; +    def 32mi_EVEX   : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; +    def 64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL;    }    // These are for the disassembler since 0x82 opcode behaves like 0x80, but    // not in 64-bit mode.    
let Predicates = [Not64BitMode] in { -  def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; -  def NAME#8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; +  def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; +  def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;    } -  def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, +  def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL,                              "{$src, %al|al, $src}">; -  def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, +  def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX,                                "{$src, %ax|ax, $src}">, OpSize16; -  def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, +  def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX,                                "{$src, %eax|eax, $src}">, OpSize32; -  def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, +  def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX,                                "{$src, %rax|rax, $src}">;  } @@ -571,162 +811,162 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,                             bit ConvertibleToThreeAddress> {    let isCommutable = CommutableRR in {      let Predicates = [NoNDD] in { -      def NAME#8rr  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; +      def 8rr  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>;        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { -        def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; -        def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; -        def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>; +        def 16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; +        def 32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; +        def 64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>;        }      }      let Predicates = [HasNDD, In64BitMode] in { -      def NAME#8rr_ND  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>; +      def 8rr_ND  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>;        let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { -        def NAME#16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD; -        def NAME#32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>; -        def NAME#64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>; +        def 16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD; +        def 32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>; +        def 64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>;        }      }    } // isCommutable    let Predicates = [In64BitMode] in { -    def NAME#8rr_EVEX  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; -    def NAME#16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; -    def NAME#32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; -    def NAME#64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; +    def 8rr_EVEX  : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; +    def 16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; +    def 32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; +    def 64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL;    } -  def NAME#8rr_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; -  def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; -  def NAME#32rr_REV : 
BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; -  def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>; +  def 8rr_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; +  def 16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; +  def 32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; +  def 64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>;    let Predicates = [In64BitMode] in { -    def NAME#8rr_ND_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; -    def NAME#16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; -    def NAME#32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; -    def NAME#64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; -    def NAME#8rr_EVEX_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; -    def NAME#16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; -    def NAME#32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; -    def NAME#64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; +    def 8rr_ND_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; +    def 16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; +    def 32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; +    def 64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; +    def 8rr_EVEX_REV  : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; +    def 16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; +    def 32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; +    def 64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL;    }    let Predicates = [NoNDD] in { -    def NAME#8rm   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; -    def NAME#16rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; -    def NAME#32rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; -    def NAME#64rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>; +    def 8rm   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; +    def 16rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; +    def 32rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; +    def 64rm  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>;    }    let Predicates = [HasNDD, In64BitMode] in { -    def NAME#8rm_ND   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>; -    def NAME#16rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD; -    def NAME#32rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>; -    def NAME#64rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>; +    def 8rm_ND   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>; +    def 16rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD; +    def 32rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>; +    def 64rm_ND  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>;    }    let Predicates = [In64BitMode] in { -    def NAME#8rm_EVEX   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL; -    def NAME#16rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD; -    def NAME#32rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL; -    def NAME#64rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL; +    def 8rm_EVEX   : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL; +    def 16rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD; +    def 32rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL; +    def 64rm_EVEX  : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL;    }    let Predicates = [NoNDD] in { -    def 
NAME#8ri   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; +    def 8ri   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>;      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {        // NOTE: These are order specific, we want the ri8 forms to be listed        // first so that they are slightly preferred to the ri forms. -      def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; -      def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; -      def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; +      def 16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; +      def 32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; +      def 64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; -      def NAME#16ri  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; -      def NAME#32ri  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; -      def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; +      def 16ri  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; +      def 32ri  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; +      def 64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>;      }    }    let Predicates = [HasNDD, In64BitMode] in { -    def NAME#8ri_ND   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>; +    def 8ri_ND   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>;      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { -      def NAME#16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; -      def NAME#32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; -      def NAME#64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; -      def NAME#16ri_ND  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; -      def NAME#32ri_ND  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; -      def NAME#64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>; +      def 16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; +      def 32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; +      def 64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; +      def 16ri_ND  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; +      def 32ri_ND  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; +      def 64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>;      }    }    let Predicates = [In64BitMode] in { -    def NAME#8ri_EVEX   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; -    def NAME#16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; -    def NAME#32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; -    def NAME#64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; -    def NAME#16ri_EVEX  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; -    def NAME#32ri_EVEX  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; -    def NAME#64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL; +    def 8ri_EVEX   : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; +    def 16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; +    def 32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; +    def 64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; +    def 16ri_EVEX  : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; +    def 32ri_EVEX  : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; +    def 64ri32_EVEX: BinOpRIF_RF<0x81, 
mnemonic, Xi64, opnode, RegMRM>, PL;    } -  def NAME#8mr    : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; -  def NAME#16mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; -  def NAME#32mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; -  def NAME#64mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>; +  def 8mr    : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; +  def 16mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; +  def 32mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; +  def 64mr   : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>;    let Predicates = [HasNDD, In64BitMode] in { -    def NAME#8mr_ND    : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; -    def NAME#16mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; -    def NAME#32mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>; -    def NAME#64mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>; +    def 8mr_ND    : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; +    def 16mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; +    def 32mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>; +    def 64mr_ND   : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>;    }    let Predicates = [In64BitMode] in { -    def NAME#8mr_EVEX    : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; -    def NAME#16mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; -    def NAME#32mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; -    def NAME#64mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; +    def 8mr_EVEX    : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; +    def 16mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; +    def 32mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; +    def 64mr_EVEX   : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL;    }    // NOTE: These are order specific, we want the mi8 forms to be listed    // first so that they are slightly preferred to the mi forms. 
-  def NAME#8mi    : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; -  def NAME#16mi8  : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16; -  def NAME#32mi8  : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32; +  def 8mi    : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; +  def 16mi8  : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16; +  def 32mi8  : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -    def NAME#64mi8  : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; -  def NAME#16mi   : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; -  def NAME#32mi   : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; +    def 64mi8  : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; +  def 16mi   : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; +  def 32mi   : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -    def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; +    def 64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>;    let Predicates = [HasNDD, In64BitMode] in { -    def NAME#8mi_ND    : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; -    def NAME#16mi8_ND  : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD; -    def NAME#32mi8_ND  : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>; -    def NAME#64mi8_ND  : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>; -    def NAME#16mi_ND   : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; -    def NAME#32mi_ND   : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; -    def NAME#64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; +    def 8mi_ND    : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; +    def 16mi8_ND  : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD; +    def 32mi8_ND  : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>; +    def 64mi8_ND  : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>; +    def 16mi_ND   : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; +    def 32mi_ND   : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; +    def 64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>;    }    let Predicates = [In64BitMode] in { -    def NAME#8mi_EVEX    : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; -    def NAME#16mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD; -    def NAME#32mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL; -    def NAME#64mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL; -    def NAME#16mi_EVEX   : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; -    def NAME#32mi_EVEX   : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; -    def NAME#64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL; +    def 8mi_EVEX    : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; +    def 16mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD; +    def 32mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL; +    def 64mi8_EVEX  : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL; +    def 16mi_EVEX   : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; +    def 32mi_EVEX   : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; +    def 64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL;    }    // These are for the disassembler since 0x82 opcode behaves like 0x80, but    // not in 64-bit mode.    
let Predicates = [Not64BitMode]  in { -    def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; -  def NAME#8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; +    def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; +  def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly;    } -  def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, +  def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL,                               "{$src, %al|al, $src}">; -  def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, +  def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX,                                 "{$src, %ax|ax, $src}">, OpSize16; -  def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, +  def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX,                                 "{$src, %eax|eax, $src}">, OpSize32; -  def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, +  def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX,                                 "{$src, %rax|rax, $src}">;  } @@ -739,71 +979,71 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,                          SDNode opnode, bit CommutableRR,                          bit ConvertibleToThreeAddress> {    let isCommutable = CommutableRR in { -  def NAME#8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; +  def 8rr  : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;      let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { -    def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; -    def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; -    def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; +    def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; +    def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; +    def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;      } // isConvertibleToThreeAddress    } // isCommutable -  def NAME#8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; -  def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; -  def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; -  def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; +  def 8rr_REV  : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; +  def 16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; +  def 32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; +  def 64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; -  def NAME#8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; -  def NAME#16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; -  def NAME#32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; -  def NAME#64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; +  def 8rm   : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; +  def 16rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; +  def 32rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; +  def 64rm  : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; -  def NAME#8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; +  def 8ri   : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;    let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {    // NOTE: These are order specific, we want the ri8 forms to be listed    // first so that they are slightly preferred to the ri forms. 
-  def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; -  def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; -  def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; +  def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; +  def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; +  def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; -  def NAME#16ri  : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; -  def NAME#32ri  : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; -  def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; +  def 16ri  : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; +  def 32ri  : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; +  def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>;    } -  def NAME#8mr    : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; -  def NAME#16mr   : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; -  def NAME#32mr   : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; -  def NAME#64mr   : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; +  def 8mr    : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; +  def 16mr   : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; +  def 32mr   : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; +  def 64mr   : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;    // NOTE: These are order specific, we want the mi8 forms to be listed    // first so that they are slightly preferred to the mi forms. -  def NAME#16mi8  : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16; -  def NAME#32mi8  : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32; +  def 16mi8  : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16; +  def 32mi8  : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -  def NAME#64mi8  : BinOpMI8_F<mnemonic, Xi64, MemMRM>; +  def 64mi8  : BinOpMI8_F<mnemonic, Xi64, MemMRM>; -  def NAME#8mi    : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; -  def NAME#16mi   : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; -  def NAME#32mi   : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; +  def 8mi    : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; +  def 16mi   : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; +  def 32mi   : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32;    let Predicates = [In64BitMode] in -  def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; +  def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>;    // These are for the disassembler since 0x82 opcode behaves like 0x80, but    // not in 64-bit mode.    
let Predicates = [Not64BitMode] in { -  def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; +  def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly;      let mayLoad = 1 in -    def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; +    def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>;    } -  def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, +  def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL,                             "{$src, %al|al, $src}">; -  def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, +  def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX,                             "{$src, %ax|ax, $src}">, OpSize16; -  def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, +  def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX,                             "{$src, %eax|eax, $src}">, OpSize32; -  def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, +  def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX,                             "{$src, %rax|rax, $src}">;  } @@ -1119,14 +1359,34 @@ defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W;  // We don't have patterns for these as there is no advantage over ADC for  // most code.  let Form = MRMSrcReg in { -def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; +  def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32>, T8, PD; +  def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64>, T8, PD; +  def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32>, T8, XS; +  def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64>, T8, XS; +  let Predicates =[In64BitMode] in { +    def ADCX32rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; +    def ADCX64rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; +    def ADOX32rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; +    def ADOX64rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; +    def ADCX32rr_ND : BinOpRRF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; +    def ADCX64rr_ND : BinOpRRF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; +    def ADOX32rr_ND : BinOpRRF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; +    def ADOX64rr_ND : BinOpRRF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; +  }  }  let Form = MRMSrcMem in { -def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; +  def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32>, T8, PD; +  def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64>, T8, PD; +  def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32>, T8, XS; +  def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64>, T8, XS; +  let Predicates =[In64BitMode] in { +    def ADCX32rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; +    def ADCX64rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; +    def ADOX32rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; +    def ADOX64rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; +    def ADCX32rm_ND : BinOpRMF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; +    def ADCX64rm_ND : BinOpRMF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; +    def ADOX32rm_ND : BinOpRMF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; +    def ADOX64rm_ND : BinOpRMF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; +  }  } diff --git 
a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td index 6e76b44b66a3..8798b13a1761 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrFormats.td @@ -247,8 +247,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,    bit hasREPPrefix = 0;     // Does this inst have a REP prefix?    bits<2> OpEncBits = OpEnc.Value;    bit IgnoresW = 0;         // Does this inst ignore REX_W field? -  bit EVEX_W1_VEX_W0 = 0;   // This EVEX inst with VEX.W==1 can become a VEX -                            // instruction with VEX.W == 0.    bit hasVEX_4V = 0;        // Does this inst require the VEX.VVVV field?    bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?    bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit @@ -279,10 +277,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,                                       CD8_EltSize,                                       !srl(VectSize, CD8_Form{1-0}))), 0); -  // Used to prevent an explicit EVEX2VEX override for this instruction. -  string EVEX2VEXOverride = ?; - -  bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.    ExplicitOpPrefix explicitOpPrefix = NoExplicitOpPrefix;    bits<2> explicitOpPrefixBits = explicitOpPrefix.Value;    // TSFlags layout should be kept in sync with X86BaseInfo.h. diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h index eac8d79eb8a3..eb0734f9a618 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrInfo.h @@ -29,8 +29,10 @@ class X86Subtarget;  namespace X86 {  enum AsmComments { +  // For instr that was compressed from EVEX to LEGACY. +  AC_EVEX_2_LEGACY = MachineInstr::TAsmComments,    // For instr that was compressed from EVEX to VEX. 
-  AC_EVEX_2_VEX = MachineInstr::TAsmComments +  AC_EVEX_2_VEX = AC_EVEX_2_LEGACY << 1  };  /// Return a pair of condition code for the given predicate and whether diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMisc.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMisc.td index 305bd74f7bd7..97c625a64cfc 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMisc.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrMisc.td @@ -1212,36 +1212,33 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {                        (implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>;  } -multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, -                  RegisterClass RC, X86MemOperand x86memop, -                  X86FoldableSchedWrite sched, string Suffix = ""> { -let hasSideEffects = 0 in { -  def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), -                    !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, -                  T8, VEX, VVVV, Sched<[sched]>; -  let mayLoad = 1 in -  def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), -                    !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, -                  T8, VEX, VVVV, Sched<[sched.Folded]>; -} +multiclass Bls<string m, Format RegMRM, Format MemMRM, X86TypeInfo t, string Suffix = ""> { +  let SchedRW = [WriteBLS] in { +    def rr#Suffix : UnaryOpR<0xF3, RegMRM, m, unaryop_ndd_args, t, +                             (outs t.RegClass:$dst), []>, T8, VVVV; +  } + +  let SchedRW = [WriteBLS.Folded] in +    def rm#Suffix : UnaryOpM<0xF3, MemMRM, m, unaryop_ndd_args, t, +                             (outs t.RegClass:$dst), []>, T8, VVVV;  } -let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { -  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; -  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, REX_W; -  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; -  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, REX_W; -  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; -  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, REX_W; +let Predicates = [HasBMI], Defs = [EFLAGS] in { +  defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32>, VEX; +  defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64>, VEX; +  defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32>, VEX; +  defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64>, VEX; +  defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32>, VEX; +  defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64>, VEX;  } -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { -  defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; -  defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; -  defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; -  defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; -  defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; -  defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; +let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in { +  defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32, "_EVEX">, EVEX; +  defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64, "_EVEX">, EVEX; +  defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32, "_EVEX">, EVEX; +  defm BLSMSK64 : Bls<"blsmsk", 
MRM2r, MRM2m, Xi64, "_EVEX">, EVEX; +  defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32, "_EVEX">, EVEX; +  defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX;  }  let Predicates = [HasBMI] in { @@ -1281,50 +1278,35 @@ let Predicates = [HasBMI] in {              (BLSI64rr GR64:$src)>;  } -multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC, -                         X86MemOperand x86memop, SDPatternOperator OpNode, -                         PatFrag ld_frag, X86FoldableSchedWrite Sched, -                         string Suffix = ""> { -  def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), -                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), -                    [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, -                  T8, VEX, Sched<[Sched]>; -let mayLoad = 1 in -  def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), -                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), -                    [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), -                     (implicit EFLAGS)]>, T8, VEX, -                  Sched<[Sched.Folded, -                         // x86memop:$src1 -                         ReadDefault, ReadDefault, ReadDefault, ReadDefault, -                         ReadDefault, -                         // RC:$src2 -                         Sched.ReadAfterFold]>; +multiclass Bmi4VOp3<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, +                    X86FoldableSchedWrite sched, string Suffix = ""> { +  let SchedRW = [sched], Form = MRMSrcReg4VOp3 in +    def rr#Suffix : BinOpRR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst), +                            [(set t.RegClass:$dst, EFLAGS, +                             (node t.RegClass:$src1, t.RegClass:$src2))]>, T8; +  let SchedRW = [sched.Folded, +                 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, +                 sched.ReadAfterFold], Form =  MRMSrcMem4VOp3 in +    def rm#Suffix : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst), +                            [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1), +                             t.RegClass:$src2))]>, T8;  }  let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { -  defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, -                               X86bextr, loadi32, WriteBEXTR>; -  defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, -                               X86bextr, loadi64, WriteBEXTR>, REX_W; +  defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR>, VEX; +  defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR>, VEX;  }  let Predicates = [HasBMI2, NoEGPR], Defs = [EFLAGS] in { -  defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, -                              X86bzhi, loadi32, WriteBZHI>; -  defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, -                              X86bzhi, loadi64, WriteBZHI>, REX_W; +  defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI>, VEX; +  defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI>, VEX;  } -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { -  defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, -                               X86bextr, loadi32, WriteBEXTR, "_EVEX">, EVEX; -  defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, -                               X86bextr, 
loadi64, WriteBEXTR, "_EVEX">, EVEX, REX_W; +let Predicates = [HasBMI, HasEGPR, In64BitMode], Defs = [EFLAGS] in { +  defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR, "_EVEX">, EVEX; +  defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR, "_EVEX">, EVEX;  } -let Predicates = [HasBMI2, HasEGPR], Defs = [EFLAGS] in { -  defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, -                              X86bzhi, loadi32, WriteBZHI, "_EVEX">, EVEX; -  defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, -                              X86bzhi, loadi64, WriteBZHI, "_EVEX">, EVEX, REX_W; +let Predicates = [HasBMI2, HasEGPR, In64BitMode], Defs = [EFLAGS] in { +  defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI, "_EVEX">, EVEX; +  defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI, "_EVEX">, EVEX;  }  def CountTrailingOnes : SDNodeXForm<imm, [{ @@ -1371,22 +1353,22 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,    def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),                      !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),                      [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>, -                  VEX, VVVV, Sched<[WriteALU]>; +                  NoCD8, VVVV, Sched<[WriteALU]>;    def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),                      !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),                      [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>, -                  VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; +                  NoCD8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;  }  let Predicates = [HasBMI2, NoEGPR] in {    defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, -                               X86pdep, loadi32>, T8, XD; +                               X86pdep, loadi32>, T8, XD, VEX;    defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, -                               X86pdep, loadi64>, T8, XD, REX_W; +                               X86pdep, loadi64>, T8, XD, REX_W, VEX;    defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, -                               X86pext, loadi32>, T8, XS; +                               X86pext, loadi32>, T8, XS, VEX;    defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, -                               X86pext, loadi64>, T8, XS, REX_W; +                               X86pext, loadi64>, T8, XS, REX_W, VEX;  }  let Predicates = [HasBMI2, HasEGPR] in { diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrPredicates.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrPredicates.td index 94fa6e45ded9..cb751639a057 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrPredicates.td @@ -8,8 +8,41 @@  def TruePredicate : Predicate<"true">; +// Intel x86 instructions have three separate encoding spaces: legacy, VEX, and +// EVEX. Not all X86 instructions are extended for EGPR. The following is an +// overview of which instructions are extended and how we implement them. +// +// * Legacy space +//   All instructions in legacy maps 0 and 1 that have explicit GPR or memory +//   operands can use the REX2 prefix to access the EGPR, except XSAVE*/XRSTOR. +// +// * EVEX space +//   All instructions in the EVEX space can access the EGPR in their +//   register/memory operands. 
+// +// For the above instructions, the only difference in encoding is reflected in +// the REX2/EVEX prefix when EGPR is used, i.e. the opcode and opcode name are +// unchanged. We don't add new entries in TD, and instead we extend GPR with +// R16-R31 and make them allocatable only when the feature EGPR is available. +// +// In addition, some instructions in legacy space with map 2/3 and VEX space are +// promoted into EVEX space. The encoding space changes after the promotion, and +// the opcode and opcode map may change as well. For these instructions, we add +// new entries in TD to avoid overcomplicating the assembler and disassembler. +// +// HasEGPR is for the new entries and NoEGPR is for the entries before +// promotion, so that the promoted variant can be selected first to benefit RA.  def HasEGPR      : Predicate<"Subtarget->hasEGPR()">;  def NoEGPR       : Predicate<"!Subtarget->hasEGPR()">; + +// APX extends some instructions with a new form that has an extra register +// operand called a new data destination (NDD). In such forms, NDD is the new +// destination register receiving the result of the computation and all other +// operands (including the original destination operand) become read-only source +// operands. +// +// HasNDD is for the new NDD entries and NoNDD is for the legacy 2-address +// entries, so that the NDD variant can be selected first to benefit RA.  def HasNDD       : Predicate<"Subtarget->hasNDD()">;  def NoNDD        : Predicate<"!Subtarget->hasNDD()">;  def HasCMOV      : Predicate<"Subtarget->canUseCMOV()">; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td index d13e3b7af69a..f951894db189 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -868,7 +868,7 @@ let Predicates = [HasBMI2, NoEGPR] in {    defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W;  } -let Predicates = [HasBMI2, HasEGPR] in { +let Predicates = [HasBMI2, HasEGPR, In64BitMode] in {    defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX;    defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX;    defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX; diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td index 699e5847e63f..b1be4739617d 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSystem.td @@ -695,14 +695,14 @@ def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),                    Requires<[Not64BitMode, HasINVPCID]>;  def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),                    "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD, -                  Requires<[In64BitMode, HasINVPCID]>; +                  Requires<[In64BitMode]>;  def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),                         "invpcid\t{$src2, $src1|$src1, $src2}", []>, -                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>; +                     EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>;  } // SchedRW -let Predicates = [In64BitMode, HasINVPCID] in { +let Predicates = [HasINVPCID, NoEGPR] in {    // The instruction can only use a 64 bit register as the register argument    // in 64 bit mode,
while the intrinsic only accepts a 32 bit argument    // corresponding to it. @@ -714,6 +714,13 @@ let Predicates = [In64BitMode, HasINVPCID] in {                addr:$src2)>;  } +let Predicates = [HasINVPCID, HasEGPR] in { +  def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2), +            (INVPCID64_EVEX +              (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit), +              addr:$src2)>; +} +  //===----------------------------------------------------------------------===//  // SMAP Instruction diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrUtils.td b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrUtils.td index da85922a018d..f4ae15837fbf 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86InstrUtils.td +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86InstrUtils.td @@ -43,8 +43,6 @@ class XOP { Encoding OpEnc = EncXOP; }  class VEX { Encoding OpEnc = EncVEX; }  class EVEX { Encoding OpEnc = EncEVEX; }  class WIG  { bit IgnoresW = 1; } -// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX. -class VEX_W1X  { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; }  class VEX_L  { bit hasVEX_L = 1; }  class VEX_LIG { bit ignoresVEX_L = 1; }  class VVVV { bit hasVEX_4V = 1; } @@ -66,9 +64,6 @@ class EVEX_CD8<int esize, CD8VForm form> {  }  class NoCD8 { bits<7> CD8_Scale = 0; } -class EVEX2VEXOverride<string VEXInstrName> { -  string EVEX2VEXOverride = VEXInstrName; -}  class AVX512BIi8Base : TB, PD {    Domain ExeDomain = SSEPackedInt;    ImmType ImmT = Imm8; @@ -89,7 +84,6 @@ class AVX512PDIi8Base : TB, PD {    Domain ExeDomain = SSEPackedDouble;    ImmType ImmT = Imm8;  } -class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }  class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; }  class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; }  class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; } @@ -1005,7 +999,7 @@ class BinOpRR_RF_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>  }  // BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write  // EFLAGS. -class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0>    : BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),              [(set t.RegClass:$dst, EFLAGS,               (node t.RegClass:$src1, t.RegClass:$src2, @@ -1041,7 +1035,7 @@ class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit               (t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD<ndd>;  // BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write  // EFLAGS. 
-class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0>    : BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),              [(set t.RegClass:$dst, EFLAGS,               (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>, diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp index e1a67f61e766..133ee2041565 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2055,10 +2055,11 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {      }    } -  // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that -  // are compressed from EVEX encoding to VEX encoding. +  // Add a comment about EVEX compression    if (TM.Options.MCOptions.ShowMCEncoding) { -    if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX) +    if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_LEGACY) +      OutStreamer->AddComment("EVEX TO LEGACY Compression ", false); +    else if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)        OutStreamer->AddComment("EVEX TO VEX Compression ", false);    } diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp index 5668b514d6de..b92bffbe6239 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -75,7 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {    initializeGlobalISel(PR);    initializeWinEHStatePassPass(PR);    initializeFixupBWInstPassPass(PR); -  initializeEvexToVexInstPassPass(PR); +  initializeCompressEVEXPassPass(PR);    initializeFixupLEAPassPass(PR);    initializeFPSPass(PR);    initializeX86FixupSetCCPassPass(PR); @@ -575,7 +575,7 @@ void X86PassConfig::addPreEmitPass() {      addPass(createX86FixupInstTuning());      addPass(createX86FixupVectorConstants());    } -  addPass(createX86EvexToVexInsts()); +  addPass(createX86CompressEVEXPass());    addPass(createX86DiscriminateMemOpsPass());    addPass(createX86InsertPrefetchPass());    addPass(createX86InsertX87waitPass()); diff --git a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 49631f38017a..cd40b1d3b093 100644 --- a/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2232,6 +2232,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,    static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {      { ISD::FP_EXTEND, MVT::v8f64,   MVT::v8f32,  1 },      { ISD::FP_EXTEND, MVT::v8f64,   MVT::v16f32, 3 }, +    { ISD::FP_EXTEND, MVT::v16f64,  MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4      { ISD::FP_ROUND,  MVT::v8f32,   MVT::v8f64,  1 },      { ISD::TRUNCATE,  MVT::v2i1,    MVT::v2i8,   3 }, // sext+vpslld+vptestmd diff --git a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 05003ec304ad..1535eb622da6 100644 --- a/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -142,7 +142,7 @@ void XCoreDAGToDAGISel::Select(SDNode *N) {    switch (N->getOpcode()) {    default: break;    case ISD::Constant: { -    uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); +    uint64_t Val = N->getAsZExtVal();      if (immMskBitp(N)) {        // Transformation function: get the size of a mask        // Look for the first non-zero bit  | 
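The XCore hunk above matches mask immediates, i.e. constants of the form (1 << N) - 1, and its transformation function recovers N, the size of the mask. A minimal standalone sketch of that computation (an illustration only, using nothing beyond the C++ standard library; getMaskSize is a hypothetical name, not XCore's actual immMskBitp helper):

  #include <cstdint>
  #include <cstdio>

  // Returns true if Val is a contiguous low-bit mask, i.e. has the form
  // (1 << N) - 1 for some N >= 1, and stores that N (the mask size) in Width.
  // N is also the position of the first zero bit when scanning up from bit 0.
  static bool getMaskSize(uint64_t Val, unsigned &Width) {
    if (Val == 0 || (Val & (Val + 1)) != 0) // a mask plus one is a power of two
      return false;
    unsigned N = 0;
    while (Val & 1) { // count the run of trailing one bits
      ++N;
      Val >>= 1;
    }
    Width = N;
    return true;
  }

  int main() {
    unsigned W = 0;
    if (getMaskSize(0xFF, W))
      std::printf("0xFF is a mask of size %u\n", W); // prints 8
    if (!getMaskSize(0xFE, W))
      std::printf("0xFE is not a contiguous mask\n"); // bit 0 is clear
    return 0;
  }

LLVM's Support/MathExtras.h offers helpers such as isMask_64 for the same test; the explicit loop here is only meant to spell the property out.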
