author | Dimitry Andric <dim@FreeBSD.org> | 2024-01-09 19:58:18 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2024-01-09 19:58:18 +0000 |
commit | aca2e42c67292825f835f094eb0c4df5ce6013db (patch) | |
tree | 9cfb7eeef35545100c4f7219e794e6a0306ea6a6 /llvm/lib/Target | |
parent | 77dbea07356e1ab2f37a777d4d1ddc5dd3e301c2 (diff) | |
download | src-aca2e42c67292825f835f094eb0c4df5ce6013db.tar.gz src-aca2e42c67292825f835f094eb0c4df5ce6013db.zip |
Diffstat (limited to 'llvm/lib/Target')
191 files changed, 5189 insertions, 2257 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h index 901769c54b6e..d20ef63a72e8 100644 --- a/llvm/lib/Target/AArch64/AArch64.h +++ b/llvm/lib/Target/AArch64/AArch64.h @@ -88,6 +88,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry &); void initializeAArch64GlobalsTaggingPass(PassRegistry &); void initializeAArch64LoadStoreOptPass(PassRegistry&); +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &); void initializeAArch64MIPeepholeOptPass(PassRegistry &); void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &); diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 68f452039c9b..d5e8ed101d1c 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -1405,7 +1405,7 @@ def ProcessorFeatures { FeatureSSBS]; list<SubtargetFeature> A78C = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureFullFP16, FeatureDotProd, - FeatureFlagM, FeatureFP16FML, FeaturePAuth, + FeatureFlagM, FeaturePAuth, FeaturePerfMon, FeatureRCPC, FeatureSPE, FeatureSSBS]; list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 9b8162ce8dd4..e98f6c4984a7 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -645,7 +645,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. @@ -1231,15 +1231,6 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, // Only extend the RHS within the instruction if there is a valid extend type. if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() && isValueAvailable(RHS)) { - if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) - if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) - if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) { - Register RHSReg = getRegForValue(SI->getOperand(0)); - if (!RHSReg) - return 0; - return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, - C->getZExtValue(), SetFlags, WantResult); - } Register RHSReg = getRegForValue(RHS); if (!RHSReg) return 0; @@ -4987,15 +4978,13 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { if (Field) TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); } else { - Type *Ty = GTI.getIndexedType(); - // If this is a constant subscript, handle it quickly. 
if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; // N = N + Offset - TotalOffs += - DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue(); + TotalOffs += GTI.getSequentialElementStride(DL) * + cast<ConstantInt>(CI)->getSExtValue(); continue; } if (TotalOffs) { @@ -5006,7 +4995,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { } // N = N + Idx * ElementSize; - uint64_t ElementSize = DL.getTypeAllocSize(Ty); + uint64_t ElementSize = GTI.getSequentialElementStride(DL); unsigned IdxN = getRegForGEPIndex(Idx); if (!IdxN) return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 476d99c2a7e0..edc8cc7d4d1e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -580,7 +580,7 @@ bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, if (!isa<ConstantSDNode>(N.getNode())) return false; - uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + uint64_t Immed = N.getNode()->getAsZExtVal(); unsigned ShiftAmt; if (Immed >> 12 == 0) { @@ -611,7 +611,7 @@ bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, return false; // The immediate operand must be a 24-bit zero-extended immediate. - uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + uint64_t Immed = N.getNode()->getAsZExtVal(); // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" // have the opposite effect on the C flag, so this pattern mustn't match under @@ -1326,7 +1326,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, // MOV X0, WideImmediate // LDR X2, [BaseReg, X0] if (isa<ConstantSDNode>(RHS)) { - int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); + int64_t ImmOff = (int64_t)RHS->getAsZExtVal(); // Skip the immediate can be selected by load/store addressing mode. // Also skip the immediate can be encoded by a single ADD (SUB is also // checked by using -ImmOff). diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 102fd0c3dae2..47e665176e8b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3588,8 +3588,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // cmp w13, w12 // can be turned into: // cmp w12, w11, lsl #1 - if (!isa<ConstantSDNode>(RHS) || - !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) { + if (!isa<ConstantSDNode>(RHS) || !isLegalArithImmed(RHS->getAsZExtVal())) { SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS; if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) { @@ -3623,7 +3622,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + int16_t ValueofRHS = RHS->getAsZExtVal(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, @@ -5619,7 +5618,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else // must be calculated before hand. 
- uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); + uint64_t ScaleVal = Scale->getAsZExtVal(); if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); EVT IndexVT = Index.getValueType(); @@ -5707,7 +5706,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else // must be calculated before hand. - uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue(); + uint64_t ScaleVal = Scale->getAsZExtVal(); if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) { assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types"); EVT IndexVT = Index.getValueType(); @@ -16516,9 +16515,9 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { if (Ext0.getOperand(0).getValueType().getVectorNumElements() != VT.getVectorNumElements() * 2) return SDValue(); - if ((Ext0.getConstantOperandVal(1) != 0 && + if ((Ext0.getConstantOperandVal(1) != 0 || Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) && - (Ext1.getConstantOperandVal(1) != 0 && + (Ext1.getConstantOperandVal(1) != 0 || Ext0.getConstantOperandVal(1) != VT.getVectorNumElements())) return SDValue(); unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP @@ -22011,7 +22010,7 @@ static SDValue performBRCONDCombine(SDNode *N, SDValue Cmp = N->getOperand(3); assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); - unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); + unsigned CC = CCVal->getAsZExtVal(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cb63d8726744..10ad5b1f8f25 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -12586,6 +12586,7 @@ def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; def : TokenAlias<".2H", ".2h">; +def : TokenAlias<".2B", ".2b">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp new file mode 100644 index 000000000000..6fcd9c290e9c --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.cpp @@ -0,0 +1,828 @@ +//===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements a pass that recognizes certain loop idioms and +// transforms them into more optimized versions of the same loop. In cases +// where this happens, it can be a significant performance win. +// +// We currently only recognize one loop that finds the first mismatched byte +// in an array and returns the index, i.e. something like: +// +// while (++i != n) { +// if (a[i] != b[i]) +// break; +// } +// +// In this example we can actually vectorize the loop despite the early exit, +// although the loop vectorizer does not support it. It requires some extra +// checks to deal with the possibility of faulting loads when crossing page +// boundaries. 
However, even with these checks it is still profitable to do the +// transformation. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// * Add support for the inverse case where we scan for a matching element. +// * Permit 64-bit induction variable types. +// * Recognize loops that increment the IV *after* comparing bytes. +// * Allow 32-bit sign-extends of the IV used by the GEP. +// +//===----------------------------------------------------------------------===// + +#include "AArch64LoopIdiomTransform.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "aarch64-loop-idiom-transform" + +static cl::opt<bool> + DisableAll("disable-aarch64-lit-all", cl::Hidden, cl::init(true), + cl::desc("Disable AArch64 Loop Idiom Transform Pass.")); + +static cl::opt<bool> DisableByteCmp( + "disable-aarch64-lit-bytecmp", cl::Hidden, cl::init(false), + cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " + "not convert byte-compare loop(s).")); + +static cl::opt<bool> VerifyLoops( + "aarch64-lit-verify", cl::Hidden, cl::init(false), + cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass.")); + +namespace llvm { + +void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &); +Pass *createAArch64LoopIdiomTransformPass(); + +} // end namespace llvm + +namespace { + +class AArch64LoopIdiomTransform { + Loop *CurLoop = nullptr; + DominatorTree *DT; + LoopInfo *LI; + const TargetTransformInfo *TTI; + const DataLayout *DL; + +public: + explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI, + const TargetTransformInfo *TTI, + const DataLayout *DL) + : DT(DT), LI(LI), TTI(TTI), DL(DL) {} + + bool run(Loop *L); + +private: + /// \name Countable Loop Idiom Handling + /// @{ + + bool runOnCountableLoop(); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock *> &ExitBlocks); + + bool recognizeByteCompare(); + Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU, + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + Instruction *Index, Value *Start, Value *MaxLen); + void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, + PHINode *IndPhi, Value *MaxLen, Instruction *Index, + Value *Start, bool IncIdx, BasicBlock *FoundBB, + BasicBlock *EndBB); + /// @} +}; + +class AArch64LoopIdiomTransformLegacyPass : public LoopPass { +public: + static char ID; + + explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) { + initializeAArch64LoopIdiomTransformLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Transform AArch64-specific loop idioms"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; +}; + +bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, + LPPassManager &LPM) { + + if (skipLoop(L)) + return false; + + auto *DT = 
&getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( + *L->getHeader()->getParent()); + return AArch64LoopIdiomTransform( + DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) + .run(L); +} + +} // end anonymous namespace + +char AArch64LoopIdiomTransformLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimized vector forms", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END( + AArch64LoopIdiomTransformLegacyPass, "aarch64-lit", + "Transform specific loop idioms into optimized vector forms", false, false) + +Pass *llvm::createAArch64LoopIdiomTransformPass() { + return new AArch64LoopIdiomTransformLegacyPass(); +} + +PreservedAnalyses +AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + if (DisableAll) + return PreservedAnalyses::all(); + + const auto *DL = &L.getHeader()->getModule()->getDataLayout(); + + AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); + if (!LIT.run(&L)) + return PreservedAnalyses::all(); + + return PreservedAnalyses::none(); +} + +//===----------------------------------------------------------------------===// +// +// Implementation of AArch64LoopIdiomTransform +// +//===----------------------------------------------------------------------===// + +bool AArch64LoopIdiomTransform::run(Loop *L) { + CurLoop = L; + + if (DisableAll || L->getHeader()->getParent()->hasOptSize()) + return false; + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!L->getLoopPreheader()) + return false; + + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + return recognizeByteCompare(); +} + +bool AArch64LoopIdiomTransform::recognizeByteCompare() { + // Currently the transformation only works on scalable vector types, although + // there is no fundamental reason why it cannot be made to work for fixed + // width too. + + // We also need to know the minimum page size for the target in order to + // generate runtime memory checks to ensure the vector version won't fault. + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || + DisableByteCmp) + return false; + + BasicBlock *Header = CurLoop->getHeader(); + + // In AArch64LoopIdiomTransform::run we have already checked that the loop + // has a preheader so we can assume it's in a canonical form. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) + return false; + + PHINode *PN = dyn_cast<PHINode>(&Header->front()); + if (!PN || PN->getNumIncomingValues() != 2) + return false; + + auto LoopBlocks = CurLoop->getBlocks(); + // The first block in the loop should contain only 4 instructions, e.g. 
+ // + // while.cond: + // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] + // %inc = add i32 %res.phi, 1 + // %cmp.not = icmp eq i32 %inc, %n + // br i1 %cmp.not, label %while.end, label %while.body + // + auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); + if (std::distance(CondBBInsts.begin(), CondBBInsts.end()) > 4) + return false; + + // The second block should contain 7 instructions, e.g. + // + // while.body: + // %idx = zext i32 %inc to i64 + // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx + // %load.a = load i8, ptr %idx.a + // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx + // %load.b = load i8, ptr %idx.b + // %cmp.not.ld = icmp eq i8 %load.a, %load.b + // br i1 %cmp.not.ld, label %while.cond, label %while.end + // + auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); + if (std::distance(LoopBBInsts.begin(), LoopBBInsts.end()) > 7) + return false; + + // The incoming value to the PHI node from the loop should be an add of 1. + Value *StartIdx = nullptr; + Instruction *Index = nullptr; + if (!CurLoop->contains(PN->getIncomingBlock(0))) { + StartIdx = PN->getIncomingValue(0); + Index = dyn_cast<Instruction>(PN->getIncomingValue(1)); + } else { + StartIdx = PN->getIncomingValue(1); + Index = dyn_cast<Instruction>(PN->getIncomingValue(0)); + } + + // Limit to 32-bit types for now + if (!Index || !Index->getType()->isIntegerTy(32) || + !match(Index, m_c_Add(m_Specific(PN), m_One()))) + return false; + + // If we match the pattern, PN and Index will be replaced with the result of + // the cttz.elts intrinsic. If any other instructions are used outside of + // the loop, we cannot replace it. + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != PN && &I != Index) + for (User *U : I.users()) + if (!CurLoop->contains(cast<Instruction>(U))) + return false; + + // Match the branch instruction for the header + ICmpInst::Predicate Pred; + Value *MaxLen; + BasicBlock *EndBB, *WhileBB; + if (!match(Header->getTerminator(), + m_Br(m_ICmp(Pred, m_Specific(Index), m_Value(MaxLen)), + m_BasicBlock(EndBB), m_BasicBlock(WhileBB))) || + Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(WhileBB)) + return false; + + // WhileBB should contain the pattern of load & compare instructions. Match + // the pattern and find the GEP instructions used by the loads. 
+ ICmpInst::Predicate WhilePred; + BasicBlock *FoundBB; + BasicBlock *TrueBB; + Value *LoadA, *LoadB; + if (!match(WhileBB->getTerminator(), + m_Br(m_ICmp(WhilePred, m_Value(LoadA), m_Value(LoadB)), + m_BasicBlock(TrueBB), m_BasicBlock(FoundBB))) || + WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(TrueBB)) + return false; + + Value *A, *B; + if (!match(LoadA, m_Load(m_Value(A))) || !match(LoadB, m_Load(m_Value(B)))) + return false; + + LoadInst *LoadAI = cast<LoadInst>(LoadA); + LoadInst *LoadBI = cast<LoadInst>(LoadB); + if (!LoadAI->isSimple() || !LoadBI->isSimple()) + return false; + + GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(A); + GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(B); + + if (!GEPA || !GEPB) + return false; + + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Check we are loading i8 values from two loop invariant pointers + if (!CurLoop->isLoopInvariant(PtrA) || !CurLoop->isLoopInvariant(PtrB) || + !GEPA->getResultElementType()->isIntegerTy(8) || + !GEPB->getResultElementType()->isIntegerTy(8) || + !LoadAI->getType()->isIntegerTy(8) || + !LoadBI->getType()->isIntegerTy(8) || PtrA == PtrB) + return false; + + // Check that the index to the GEPs is the index we found earlier + if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) + return false; + + Value *IdxA = GEPA->getOperand(GEPA->getNumIndices()); + Value *IdxB = GEPB->getOperand(GEPB->getNumIndices()); + if (IdxA != IdxB || !match(IdxA, m_ZExt(m_Specific(Index)))) + return false; + + // We only ever expect the pre-incremented index value to be used inside the + // loop. + if (!PN->hasOneUse()) + return false; + + // Ensure that when the Found and End blocks are identical the PHIs have the + // supported format. We don't currently allow cases like this: + // while.cond: + // ... + // br i1 %cmp.not, label %while.end, label %while.body + // + // while.body: + // ... + // br i1 %cmp.not2, label %while.cond, label %while.end + // + // while.end: + // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] + // + // Where the incoming values for %final_ptr are unique and from each of the + // loop blocks, but not actually defined in the loop. This requires extra + // work setting up the byte.compare block, i.e. by introducing a select to + // choose the correct value. + // TODO: We could add support for this in future. + if (FoundBB == EndBB) { + for (PHINode &EndPN : EndBB->phis()) { + Value *WhileCondVal = EndPN.getIncomingValueForBlock(Header); + Value *WhileBodyVal = EndPN.getIncomingValueForBlock(WhileBB); + + // The value of the index when leaving the while.cond block is always the + // same as the end value (MaxLen) so we permit either. Otherwise for any + // other value defined outside the loop we only allow values that are the + // same as the exit value for while.body. + if (WhileCondVal != Index && WhileCondVal != MaxLen && + WhileCondVal != WhileBodyVal) + return false; + } + } + + LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" + << *(EndBB->getParent()) << "\n\n"); + + // The index is incremented before the GEP/Load pair so we need to + // add 1 to the start value. 
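As a rough illustration of why the start value is bumped (a sketch, not part of the patch): with the pre-increment idiom the first byte actually compared is `a[start + 1]`, so the expanded mismatch search must begin one past the incoming start value. The helper name below is hypothetical.

```cpp
#include <cstdint>

// Scalar semantics the transformed code must reproduce: return the index of
// the first mismatching byte, or MaxLen if none is found before the end.
static uint32_t findMismatchScalar(const uint8_t *A, const uint8_t *B,
                                   uint32_t Start, uint32_t MaxLen) {
  for (uint32_t I = Start + 1; I != MaxLen; ++I) // note the "+ 1"
    if (A[I] != B[I])
      return I;
  return MaxLen;
}
```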
+ transformByteCompare(GEPA, GEPB, PN, MaxLen, Index, StartIdx, /*IncIdx=*/true, + FoundBB, EndBB); + return true; +} + +Value *AArch64LoopIdiomTransform::expandFindMismatch( + IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, + GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { + Value *PtrA = GEPA->getPointerOperand(); + Value *PtrB = GEPB->getPointerOperand(); + + // Get the arguments and types for the intrinsic. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); + LLVMContext &Ctx = PHBranch->getContext(); + Type *LoadType = Type::getInt8Ty(Ctx); + Type *ResType = Builder.getInt32Ty(); + + // Split block in the original loop preheader. + BasicBlock *EndBlock = + SplitBlock(Preheader, PHBranch, DT, LI, nullptr, "mismatch_end"); + + // Create the blocks that we're going to need: + // 1. A block for checking the zero-extended length exceeds 0 + // 2. A block to check that the start and end addresses of a given array + // lie on the same page. + // 3. The SVE loop preheader. + // 4. The first SVE loop block. + // 5. The SVE loop increment block. + // 6. A block we can jump to from the SVE loop when a mismatch is found. + // 7. The first block of the scalar loop itself, containing PHIs , loads + // and cmp. + // 8. A scalar loop increment block to increment the PHIs and go back + // around the loop. + + BasicBlock *MinItCheckBlock = BasicBlock::Create( + Ctx, "mismatch_min_it_check", EndBlock->getParent(), EndBlock); + + // Update the terminator added by SplitBlock to branch to the first block + Preheader->getTerminator()->setSuccessor(0, MinItCheckBlock); + + BasicBlock *MemCheckBlock = BasicBlock::Create( + Ctx, "mismatch_mem_check", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopPreheaderBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_preheader", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopStartBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_inc", EndBlock->getParent(), EndBlock); + + BasicBlock *SVELoopMismatchBlock = BasicBlock::Create( + Ctx, "mismatch_sve_loop_found", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopPreHeaderBlock = BasicBlock::Create( + Ctx, "mismatch_loop_pre", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopStartBlock = + BasicBlock::Create(Ctx, "mismatch_loop", EndBlock->getParent(), EndBlock); + + BasicBlock *LoopIncBlock = BasicBlock::Create( + Ctx, "mismatch_loop_inc", EndBlock->getParent(), EndBlock); + + DTU.applyUpdates({{DominatorTree::Insert, Preheader, MinItCheckBlock}, + {DominatorTree::Delete, Preheader, EndBlock}}); + + // Update LoopInfo with the new SVE & scalar loops. 
+ auto SVELoop = LI->AllocateLoop(); + auto ScalarLoop = LI->AllocateLoop(); + + if (CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->addBasicBlockToLoop(MinItCheckBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(MemCheckBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopPreheaderBlock, *LI); + CurLoop->getParentLoop()->addChildLoop(SVELoop); + CurLoop->getParentLoop()->addBasicBlockToLoop(SVELoopMismatchBlock, *LI); + CurLoop->getParentLoop()->addBasicBlockToLoop(LoopPreHeaderBlock, *LI); + CurLoop->getParentLoop()->addChildLoop(ScalarLoop); + } else { + LI->addTopLevelLoop(SVELoop); + LI->addTopLevelLoop(ScalarLoop); + } + + // Add the new basic blocks to their associated loops. + SVELoop->addBasicBlockToLoop(SVELoopStartBlock, *LI); + SVELoop->addBasicBlockToLoop(SVELoopIncBlock, *LI); + + ScalarLoop->addBasicBlockToLoop(LoopStartBlock, *LI); + ScalarLoop->addBasicBlockToLoop(LoopIncBlock, *LI); + + // Set up some types and constants that we intend to reuse. + Type *I64Type = Builder.getInt64Ty(); + + // Check the zero-extended iteration count > 0 + Builder.SetInsertPoint(MinItCheckBlock); + Value *ExtStart = Builder.CreateZExt(Start, I64Type); + Value *ExtEnd = Builder.CreateZExt(MaxLen, I64Type); + // This check doesn't really cost us very much. + + Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen); + BranchInst *MinItCheckBr = + BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck); + MinItCheckBr->setMetadata( + LLVMContext::MD_prof, + MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1)); + Builder.Insert(MinItCheckBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, + {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); + + // For each of the arrays, check the start/end addresses are on the same + // page. + Builder.SetInsertPoint(MemCheckBlock); + + // The early exit in the original loop means that when performing vector + // loads we are potentially reading ahead of the early exit. So we could + // fault if crossing a page boundary. Therefore, we create runtime memory + // checks based on the minimum page size as follows: + // 1. Calculate the addresses of the first memory accesses in the loop, + // i.e. LhsStart and RhsStart. + // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. + // 3. Determine which pages correspond to all the memory accesses, i.e + // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. + // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then + // we know we won't cross any page boundaries in the loop so we can + // enter the vector loop! Otherwise we fall back on the scalar loop. 
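A sketch (not part of the patch) of the page check described in the numbered list above, assuming the minimum page size is a power of two (4096 per `AArch64TTIImpl::getMinPageSize` later in this commit); the helper name is illustrative.

```cpp
#include <cstdint>

// Two addresses lie on the same page iff they agree once the low
// log2(PageSize) bits are shifted away.
static bool sameMinPage(uint64_t Start, uint64_t End, uint64_t MinPageSize) {
  uint64_t Shift = 0;
  while ((1ULL << Shift) < MinPageSize)
    ++Shift; // log2 of a power-of-two page size, e.g. 12 for 4096
  return (Start >> Shift) == (End >> Shift);
}
```

The vector loop is entered only if this holds for both arrays (start vs. end address of the accessed range); otherwise the wider, ahead-of-exit vector loads could touch an unmapped page, so control falls back to the scalar loop.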
+ Value *LhsStartGEP = Builder.CreateGEP(LoadType, PtrA, ExtStart); + Value *RhsStartGEP = Builder.CreateGEP(LoadType, PtrB, ExtStart); + Value *RhsStart = Builder.CreatePtrToInt(RhsStartGEP, I64Type); + Value *LhsStart = Builder.CreatePtrToInt(LhsStartGEP, I64Type); + Value *LhsEndGEP = Builder.CreateGEP(LoadType, PtrA, ExtEnd); + Value *RhsEndGEP = Builder.CreateGEP(LoadType, PtrB, ExtEnd); + Value *LhsEnd = Builder.CreatePtrToInt(LhsEndGEP, I64Type); + Value *RhsEnd = Builder.CreatePtrToInt(RhsEndGEP, I64Type); + + const uint64_t MinPageSize = TTI->getMinPageSize().value(); + const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); + Value *LhsStartPage = Builder.CreateLShr(LhsStart, AddrShiftAmt); + Value *LhsEndPage = Builder.CreateLShr(LhsEnd, AddrShiftAmt); + Value *RhsStartPage = Builder.CreateLShr(RhsStart, AddrShiftAmt); + Value *RhsEndPage = Builder.CreateLShr(RhsEnd, AddrShiftAmt); + Value *LhsPageCmp = Builder.CreateICmpNE(LhsStartPage, LhsEndPage); + Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage); + + Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp); + BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( + LoopPreHeaderBlock, SVELoopPreheaderBlock, CombinedPageCmp); + CombinedPageCmpCmpBr->setMetadata( + LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext()) + .createBranchWeights(10, 90)); + Builder.Insert(CombinedPageCmpCmpBr); + + DTU.applyUpdates( + {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, + {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); + + // Set up the SVE loop preheader, i.e. calculate initial loop predicate, + // zero-extend MaxLen to 64-bits, determine the number of vector elements + // processed in each iteration, etc. + Builder.SetInsertPoint(SVELoopPreheaderBlock); + + // At this point we know two things must be true: + // 1. Start <= End + // 2. ExtMaxLen <= MinPageSize due to the page checks. + // Therefore, we know that we can use a 64-bit induction variable that + // starts from 0 -> ExtMaxLen and it will not overflow. + ScalableVectorType *PredVTy = + ScalableVectorType::get(Builder.getInt1Ty(), 16); + + Value *InitialPred = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); + + Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); + VecLen = Builder.CreateMul(VecLen, ConstantInt::get(I64Type, 16), "", + /*HasNUW=*/true, /*HasNSW=*/true); + + Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(), + Builder.getInt1(false)); + + BranchInst *JumpToSVELoop = BranchInst::Create(SVELoopStartBlock); + Builder.Insert(JumpToSVELoop); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); + + // Set up the first SVE loop block by creating the PHIs, doing the vector + // loads and comparing the vectors. 
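The block built next is easier to follow with a fixed-width scalar model. Below is a sketch (not part of the patch) of one vector-loop iteration, with 16 lanes standing in for an `nxv16i8` SVE vector and plain loads standing in for the masked loads; names are illustrative.

```cpp
#include <array>
#include <cstdint>

struct SVEIterResult {
  bool AnyMismatch;              // the or-reduce of the compare result
  std::array<bool, 16> MatchCmp; // per-lane "bytes differ", masked by Pred
};

static SVEIterResult sveLoopBodyOnce(const uint8_t *A, const uint8_t *B,
                                     uint64_t Index,
                                     const std::array<bool, 16> &Pred) {
  SVEIterResult R{false, {}};
  for (int Lane = 0; Lane < 16; ++Lane) {
    if (!Pred[Lane])
      continue;                  // inactive lanes keep the zero passthru
    R.MatchCmp[Lane] = A[Index + Lane] != B[Index + Lane];
    R.AnyMismatch |= R.MatchCmp[Lane];
  }
  return R;
}
```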
+ Builder.SetInsertPoint(SVELoopStartBlock); + PHINode *LoopPred = Builder.CreatePHI(PredVTy, 2, "mismatch_sve_loop_pred"); + LoopPred->addIncoming(InitialPred, SVELoopPreheaderBlock); + PHINode *SVEIndexPhi = Builder.CreatePHI(I64Type, 2, "mismatch_sve_index"); + SVEIndexPhi->addIncoming(ExtStart, SVELoopPreheaderBlock); + Type *SVELoadType = ScalableVectorType::get(Builder.getInt8Ty(), 16); + Value *Passthru = ConstantInt::getNullValue(SVELoadType); + + Value *SVELhsGep = Builder.CreateGEP(LoadType, PtrA, SVEIndexPhi); + if (GEPA->isInBounds()) + cast<GetElementPtrInst>(SVELhsGep)->setIsInBounds(true); + Value *SVELhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVELhsGep, Align(1), + LoopPred, Passthru); + + Value *SVERhsGep = Builder.CreateGEP(LoadType, PtrB, SVEIndexPhi); + if (GEPB->isInBounds()) + cast<GetElementPtrInst>(SVERhsGep)->setIsInBounds(true); + Value *SVERhsLoad = Builder.CreateMaskedLoad(SVELoadType, SVERhsGep, Align(1), + LoopPred, Passthru); + + Value *SVEMatchCmp = Builder.CreateICmpNE(SVELhsLoad, SVERhsLoad); + SVEMatchCmp = Builder.CreateSelect(LoopPred, SVEMatchCmp, PFalse); + Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(SVEMatchCmp); + BranchInst *SVEEarlyExit = BranchInst::Create( + SVELoopMismatchBlock, SVELoopIncBlock, SVEMatchHasActiveLanes); + Builder.Insert(SVEEarlyExit); + + DTU.applyUpdates( + {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, + {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); + + // Increment the index counter and calculate the predicate for the next + // iteration of the loop. We branch back to the start of the loop if there + // is at least one active lane. + Builder.SetInsertPoint(SVELoopIncBlock); + Value *NewSVEIndexPhi = Builder.CreateAdd(SVEIndexPhi, VecLen, "", + /*HasNUW=*/true, /*HasNSW=*/true); + SVEIndexPhi->addIncoming(NewSVEIndexPhi, SVELoopIncBlock); + Value *NewPred = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); + LoopPred->addIncoming(NewPred, SVELoopIncBlock); + + Value *PredHasActiveLanes = + Builder.CreateExtractElement(NewPred, uint64_t(0)); + BranchInst *SVELoopBranchBack = + BranchInst::Create(SVELoopStartBlock, EndBlock, PredHasActiveLanes); + Builder.Insert(SVELoopBranchBack); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, + {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); + + // If we found a mismatch then we need to calculate which lane in the vector + // had a mismatch and add that on to the current loop index. 
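A sketch (not part of the patch) of what the `llvm.experimental.cttz.elts` call computes here: counting the trailing inactive lanes of the combined predicate gives the lane of the first mismatch, which is then added to the loop index. The helper is illustrative and uses a plain array in place of a scalable predicate.

```cpp
#include <cstdint>
#include <vector>

static uint64_t firstMismatchIndex(const std::vector<bool> &PredMatchCmp,
                                   uint64_t LoopIndex) {
  uint64_t Lane = 0;
  while (Lane < PredMatchCmp.size() && !PredMatchCmp[Lane])
    ++Lane;                // trailing zero elements == first mismatching lane
  return LoopIndex + Lane; // e.g. {0,0,1,...} at index 32 -> 34
}
```

Control only reaches the mismatch block when the or-reduce was true, so at least one lane is known to be set, which is why the intrinsic is called with ZeroIsPoison set to true.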
+ Builder.SetInsertPoint(SVELoopMismatchBlock); + PHINode *FoundPred = Builder.CreatePHI(PredVTy, 1, "mismatch_sve_found_pred"); + FoundPred->addIncoming(SVEMatchCmp, SVELoopStartBlock); + PHINode *LastLoopPred = + Builder.CreatePHI(PredVTy, 1, "mismatch_sve_last_loop_pred"); + LastLoopPred->addIncoming(LoopPred, SVELoopStartBlock); + PHINode *SVEFoundIndex = + Builder.CreatePHI(I64Type, 1, "mismatch_sve_found_index"); + SVEFoundIndex->addIncoming(SVEIndexPhi, SVELoopStartBlock); + + Value *PredMatchCmp = Builder.CreateAnd(LastLoopPred, FoundPred); + Value *Ctz = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, + {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Ctz = Builder.CreateZExt(Ctz, I64Type); + Value *SVELoopRes64 = Builder.CreateAdd(SVEFoundIndex, Ctz, "", + /*HasNUW=*/true, /*HasNSW=*/true); + Value *SVELoopRes = Builder.CreateTrunc(SVELoopRes64, ResType); + + Builder.Insert(BranchInst::Create(EndBlock)); + + DTU.applyUpdates({{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); + + // Generate code for scalar loop. + Builder.SetInsertPoint(LoopPreHeaderBlock); + Builder.Insert(BranchInst::Create(LoopStartBlock)); + + DTU.applyUpdates( + {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); + + Builder.SetInsertPoint(LoopStartBlock); + PHINode *IndexPhi = Builder.CreatePHI(ResType, 2, "mismatch_index"); + IndexPhi->addIncoming(Start, LoopPreHeaderBlock); + + // Otherwise compare the values + // Load bytes from each array and compare them. + Value *GepOffset = Builder.CreateZExt(IndexPhi, I64Type); + + Value *LhsGep = Builder.CreateGEP(LoadType, PtrA, GepOffset); + if (GEPA->isInBounds()) + cast<GetElementPtrInst>(LhsGep)->setIsInBounds(true); + Value *LhsLoad = Builder.CreateLoad(LoadType, LhsGep); + + Value *RhsGep = Builder.CreateGEP(LoadType, PtrB, GepOffset); + if (GEPB->isInBounds()) + cast<GetElementPtrInst>(RhsGep)->setIsInBounds(true); + Value *RhsLoad = Builder.CreateLoad(LoadType, RhsGep); + + Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad); + // If we have a mismatch then exit the loop ... + BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp); + Builder.Insert(MatchCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, + {DominatorTree::Insert, LoopStartBlock, EndBlock}}); + + // Have we reached the maximum permitted length for the loop? + Builder.SetInsertPoint(LoopIncBlock); + Value *PhiInc = Builder.CreateAdd(IndexPhi, ConstantInt::get(ResType, 1), "", + /*HasNUW=*/Index->hasNoUnsignedWrap(), + /*HasNSW=*/Index->hasNoSignedWrap()); + IndexPhi->addIncoming(PhiInc, LoopIncBlock); + Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen); + BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp); + Builder.Insert(IVCmpBr); + + DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock}, + {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); + + // In the end block we need to insert a PHI node to deal with three cases: + // 1. We didn't find a mismatch in the scalar loop, so we return MaxLen. + // 2. We exitted the scalar loop early due to a mismatch and need to return + // the index that we found. + // 3. We didn't find a mismatch in the SVE loop, so we return MaxLen. + // 4. We exitted the SVE loop early due to a mismatch and need to return + // the index that we found. 
+ Builder.SetInsertPoint(EndBlock, EndBlock->getFirstInsertionPt()); + PHINode *ResPhi = Builder.CreatePHI(ResType, 4, "mismatch_result"); + ResPhi->addIncoming(MaxLen, LoopIncBlock); + ResPhi->addIncoming(IndexPhi, LoopStartBlock); + ResPhi->addIncoming(MaxLen, SVELoopIncBlock); + ResPhi->addIncoming(SVELoopRes, SVELoopMismatchBlock); + + Value *FinalRes = Builder.CreateTrunc(ResPhi, ResType); + + if (VerifyLoops) { + ScalarLoop->verifyLoop(); + SVELoop->verifyLoop(); + if (!SVELoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + if (!ScalarLoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } + + return FinalRes; +} + +void AArch64LoopIdiomTransform::transformByteCompare( + GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, + Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, + BasicBlock *FoundBB, BasicBlock *EndBB) { + + // Insert the byte compare code at the end of the preheader block + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BasicBlock *Header = CurLoop->getHeader(); + BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + // Increment the pointer if this was done before the loads in the loop. + if (IncIdx) + Start = Builder.CreateAdd(Start, ConstantInt::get(Start->getType(), 1)); + + Value *ByteCmpRes = + expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); + + // Replaces uses of index & induction Phi with intrinsic (we already + // checked that the the first instruction of Header is the Phi above). + assert(IndPhi->hasOneUse() && "Index phi node has more than one use!"); + Index->replaceAllUsesWith(ByteCmpRes); + + assert(PHBranch->isUnconditional() && + "Expected preheader to terminate with an unconditional branch."); + + // If no mismatch was found, we can jump to the end block. Create a + // new basic block for the compare instruction. + auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare", + Preheader->getParent()); + CmpBB->moveBefore(EndBB); + + // Replace the branch in the preheader with an always-true conditional branch. + // This ensures there is still a reference to the original loop. + Builder.CreateCondBr(Builder.getTrue(), CmpBB, Header); + PHBranch->eraseFromParent(); + + BasicBlock *MismatchEnd = cast<Instruction>(ByteCmpRes)->getParent(); + DTU.applyUpdates({{DominatorTree::Insert, MismatchEnd, CmpBB}}); + + // Create the branch to either the end or found block depending on the value + // returned by the intrinsic. + Builder.SetInsertPoint(CmpBB); + if (FoundBB != EndBB) { + Value *FoundCmp = Builder.CreateICmpEQ(ByteCmpRes, MaxLen); + Builder.CreateCondBr(FoundCmp, EndBB, FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}, + {DominatorTree::Insert, CmpBB, EndBB}}); + + } else { + Builder.CreateBr(FoundBB); + DTU.applyUpdates({{DominatorTree::Insert, CmpBB, FoundBB}}); + } + + auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { + for (PHINode &PN : SuccBB->phis()) { + // At this point we've already replaced all uses of the result from the + // loop with ByteCmp. Look through the incoming values to find ByteCmp, + // meaning this is a Phi collecting the results of the byte compare. 
+ bool ResPhi = false; + for (Value *Op : PN.incoming_values()) + if (Op == ByteCmpRes) { + ResPhi = true; + break; + } + + // Any PHI that depended upon the result of the byte compare needs a new + // incoming value from CmpBB. This is because the original loop will get + // deleted. + if (ResPhi) + PN.addIncoming(ByteCmpRes, CmpBB); + else { + // There should be no other outside uses of other values in the + // original loop. Any incoming values should either: + // 1. Be for blocks outside the loop, which aren't interesting. Or .. + // 2. These are from blocks in the loop with values defined outside + // the loop. We should a similar incoming value from CmpBB. + for (BasicBlock *BB : PN.blocks()) + if (CurLoop->contains(BB)) { + PN.addIncoming(PN.getIncomingValueForBlock(BB), CmpBB); + break; + } + } + } + }; + + // Ensure all Phis in the successors of CmpBB have an incoming value from it. + fixSuccessorPhis(EndBB); + if (EndBB != FoundBB) + fixSuccessorPhis(FoundBB); + + // The new CmpBB block isn't part of the loop, but will need to be added to + // the outer loop if there is one. + if (!CurLoop->isOutermost()) + CurLoop->getParentLoop()->addBasicBlockToLoop(CmpBB, *LI); + + if (VerifyLoops && CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->verifyLoop(); + if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } +} diff --git a/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h new file mode 100644 index 000000000000..cc68425bb68b --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64LoopIdiomTransform.h @@ -0,0 +1,25 @@ +//===- AArch64LoopIdiomTransform.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" + +namespace llvm { + +struct AArch64LoopIdiomTransformPass + : PassInfoMixin<AArch64LoopIdiomTransformPass> { + PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, LPMUpdater &U); +}; + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AARCH64_AARCH64LOOPIDIOMTRANSFORM_H diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 738a52eebad2..380f6e1fcfda 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -810,7 +810,7 @@ defm FMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmopa", 0b0, 0b0, 0b11, ZPR16>; defm FMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"fmops", 0b0, 0b1, 0b11, ZPR16>; } -let Predicates = [HasSME2p1, HasB16B16] in { +let Predicates = [HasSME2, HasB16B16] in { defm BFADD_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfadd", 0b1100, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; defm BFADD_VG4_M4Z_H : sme2_multivec_accum_add_sub_vg4<"bfadd", 0b1100, MatrixOp16, ZZZZ_h_mul_r, nxv8bf16, null_frag>; defm BFSUB_VG2_M2Z_H : sme2_multivec_accum_add_sub_vg2<"bfsub", 0b1101, MatrixOp16, ZZ_h_mul_r, nxv8bf16, null_frag>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 344a15389063..ee10a7d1c706 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -453,6 +453,9 @@ def AArch64msb_m1 : PatFrags<(ops node:$pred, node:$op1, node:$op2, node:$op3), def AArch64eor3 : PatFrags<(ops node:$op1, node:$op2, node:$op3), [(int_aarch64_sve_eor3 node:$op1, node:$op2, node:$op3), (xor node:$op1, (xor node:$op2, node:$op3))]>; +def AArch64bcax : PatFrags<(ops node:$op1, node:$op2, node:$op3), + [(int_aarch64_sve_bcax node:$op1, node:$op2, node:$op3), + (xor node:$op1, (and node:$op2, (vnot node:$op3)))]>; def AArch64fmla_m1 : PatFrags<(ops node:$pg, node:$za, node:$zn, node:$zm), [(int_aarch64_sve_fmla node:$pg, node:$za, node:$zn, node:$zm), @@ -3714,7 +3717,7 @@ let Predicates = [HasSVE2orSME] in { // SVE2 bitwise ternary operations defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", AArch64eor3>; - defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>; + defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", AArch64bcax>; defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl, AArch64bsp>; defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>; defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>; diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index 1a76f354589e..9e43f206efcf 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -172,7 +172,7 @@ static SDValue EmitUnrolledSetTag(SelectionDAG &DAG, const SDLoc &dl, SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Addr, SDValue Size, MachinePointerInfo DstPtrInfo, bool ZeroData) const { - uint64_t 
ObjSize = cast<ConstantSDNode>(Size)->getZExtValue(); + uint64_t ObjSize = Size->getAsZExtVal(); assert(ObjSize % 16 == 0); MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 036719be06d8..144610e021c5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,6 +11,7 @@ #include "AArch64TargetMachine.h" #include "AArch64.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64MachineScheduler.h" #include "AArch64MacroFusion.h" @@ -43,6 +44,7 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLoweringObjectFile.h" @@ -222,6 +224,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); + initializeAArch64LoopIdiomTransformLegacyPassPass(*PR); initializeAArch64MIPeepholeOptPass(*PR); initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64O0PreLegalizerCombinerPass(*PR); @@ -537,6 +540,14 @@ public: } // end anonymous namespace +void AArch64TargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { + PB.registerLateLoopOptimizationsEPCallback( + [=](LoopPassManager &LPM, OptimizationLevel Level) { + LPM.addPass(AArch64LoopIdiomTransformPass()); + }); +} + TargetTransformInfo AArch64TargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(AArch64TTIImpl(this, F)); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h index 12b971853f84..8fb68b06f137 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H #include "AArch64InstrInfo.h" +#include "AArch64LoopIdiomTransform.h" #include "AArch64Subtarget.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetMachine.h" @@ -43,6 +44,9 @@ public: // Pass Pipeline Configuration TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; + TargetTransformInfo getTargetTransformInfo(const Function &F) const override; TargetLoweringObjectFile* getObjFileLowering() const override { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 0b220069a388..f471294ffc25 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -420,6 +420,8 @@ public: return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + std::optional<unsigned> getMinPageSize() const { return 4096; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 1d0e8be80d07..b657a0954d78 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -282,6 +282,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Regardless of FP16 
support, widen 16-bit elements to 32-bits. .minScalar(0, s32) .libcallFor({s32, s64}); + getActionDefinitionsBuilder(G_FPOWI) + .scalarize(0) + .minScalar(0, s32) + .libcallFor({{s32, s32}, {s64, s32}}); getActionDefinitionsBuilder(G_INSERT) .legalIf(all(typeInSet(0, {s32, s64, p0}), @@ -362,7 +366,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}}) // These extends are also legal - .legalForTypesWithMemDesc({{s32, p0, s8, 8}, {s32, p0, s16, 8}}) + .legalForTypesWithMemDesc( + {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}}) .widenScalarToNextPow2(0, /* MinSize = */ 8) .lowerIfMemSizeNotByteSizePow2() .clampScalar(0, s8, s64) @@ -761,17 +766,35 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf( all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); + LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { + return ST.outlineAtomics() && !ST.hasLSE(); + }; + getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) - .customIf([](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() == 128; + .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), + predNot(UseOutlineAtomics))) + .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) + .customIf([UseOutlineAtomics](const LegalityQuery &Query) { + return Query.Types[0].getSizeInBits() == 128 && + !UseOutlineAtomics(Query); }) + .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), + UseOutlineAtomics)) + .clampScalar(0, s32, s64); + + getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, + G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, + G_ATOMICRMW_XOR}) + .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), + predNot(UseOutlineAtomics))) + .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), + UseOutlineAtomics)) .clampScalar(0, s32, s64); + // Do not outline these atomics operations, as per comment in + // AArch64ISelLowering.cpp's shouldExpandAtomicRMWInIR(). getActionDefinitionsBuilder( - {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, - G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, - G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) + {G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN, G_ATOMICRMW_UMAX}) .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0))) .clampScalar(0, s32, s64); @@ -989,6 +1012,23 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s16, 8) .lower(); + // For fmul reductions we need to split up into individual operations. We + // clamp to 128 bit vectors then to 64bit vectors to produce a cascade of + // smaller types, followed by scalarizing what remains. 
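A sketch (not part of the patch) of the scalar shape of that cascade for a reassociable fmul reduction: halving the vector and multiplying the halves element-wise preserves the overall product, so an 8-element reduction shrinks to 4, then 2, then a scalar multiply. The strictly ordered `G_VECREDUCE_SEQ_FMUL` handled just below cannot be reassociated like this, which is why it is scalarized instead.

```cpp
#include <array>

// Fast-math style fmul reduction by repeated halving; rounding may differ
// from a strict left-to-right product, which is acceptable for the non-SEQ
// reduction opcode.
static float reduceFMul8(const std::array<float, 8> &V) {
  std::array<float, 4> H4;
  for (int I = 0; I < 4; ++I)
    H4[I] = V[I] * V[I + 4];                             // 8 elements -> 4
  std::array<float, 2> H2{H4[0] * H4[2], H4[1] * H4[3]}; // 4 -> 2
  return H2[0] * H2[1];                                  // scalarize the rest
}
```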
+ getActionDefinitionsBuilder(G_VECREDUCE_FMUL) + .minScalarOrElt(0, MinFPScalar) + .clampMaxNumElements(1, s64, 2) + .clampMaxNumElements(1, s32, 4) + .clampMaxNumElements(1, s16, 8) + .clampMaxNumElements(1, s32, 2) + .clampMaxNumElements(1, s16, 4) + .scalarize(1) + .lower(); + + getActionDefinitionsBuilder({G_VECREDUCE_SEQ_FADD, G_VECREDUCE_SEQ_FMUL}) + .scalarize(2) + .lower(); + getActionDefinitionsBuilder(G_VECREDUCE_ADD) .legalFor({{s8, v16s8}, {s8, v8s8}, @@ -1137,8 +1177,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) verify(*ST.getInstrInfo()); } -bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool AArch64LegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; MachineRegisterInfo &MRI = *MIRBuilder.getMRI(); GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 19f77baa77f8..c62a9d847c52 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -23,12 +23,12 @@ namespace llvm { class AArch64Subtarget; -/// This class provides the information for the target register banks. class AArch64LegalizerInfo : public LegalizerInfo { public: AArch64LegalizerInfo(const AArch64Subtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index b7552541e950..789ec817d3d8 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10082,6 +10082,12 @@ multiclass sve2p1_vector_to_pred<string mnemonic, SDPatternOperator Op_lane, SDP def : InstAlias<mnemonic # "\t$Pd, $Zn", (!cast<Instruction>(NAME # _B) PPR8:$Pd, ZPRAny:$Zn, 0), 1>; + def : InstAlias<mnemonic # "\t$Pd, $Zn", + (!cast<Instruction>(NAME # _H) PPR16:$Pd, ZPRAny:$Zn, 0), 0>; + def : InstAlias<mnemonic # "\t$Pd, $Zn", + (!cast<Instruction>(NAME # _S) PPR32:$Pd, ZPRAny:$Zn, 0), 0>; + def : InstAlias<mnemonic # "\t$Pd, $Zn", + (!cast<Instruction>(NAME # _D) PPR64:$Pd, ZPRAny:$Zn, 0), 0>; // any_lane def : Pat<(nxv16i1 (Op_lane (nxv16i8 ZPRAny:$Zn), (i32 timm32_0_0:$Idx))), @@ -10143,6 +10149,12 @@ multiclass sve2p1_pred_to_vector<string mnemonic, SDPatternOperator MergeOp, def : InstAlias<mnemonic # "\t$Zd, $Pn", (!cast<Instruction>(NAME # _B) ZPRAny:$Zd, 0, PPR8:$Pn), 1>; + def : InstAlias<mnemonic # "\t$Zd, $Pn", + (!cast<Instruction>(NAME # _H) ZPRAny:$Zd, 0, PPR16:$Pn), 0>; + def : InstAlias<mnemonic # "\t$Zd, $Pn", + (!cast<Instruction>(NAME # _S) ZPRAny:$Zd, 0, PPR32:$Pn), 0>; + def : InstAlias<mnemonic # "\t$Zd, $Pn", + (!cast<Instruction>(NAME # _D) ZPRAny:$Zd, 0, PPR64:$Pn), 0>; // Merge def : Pat<(nxv8i16 (MergeOp (nxv8i16 ZPRAny:$Zd), (nxv8i1 PPR16:$Pn), (i32 timm32_1_1:$Idx))), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index d2a325d5ad89..df8c35ffd457 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -761,6 +761,12 @@ def FeatureShaderCyclesRegister : SubtargetFeature<"shader-cycles-register", 
"Has SHADER_CYCLES hardware register" >; +def FeatureShaderCyclesHiLoRegisters : SubtargetFeature<"shader-cycles-hi-lo-registers", + "HasShaderCyclesHiLoRegisters", + "true", + "Has SHADER_CYCLES_HI/LO hardware registers" +>; + def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", "HasMadMacF32Insts", "true", @@ -1469,7 +1475,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureNSAEncoding, FeaturePartialNSAEncoding, FeatureWavefrontSize32, - FeatureShaderCyclesRegister, + FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, @@ -1970,6 +1976,8 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">, AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>; +def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegisters()">; + def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">, AssemblerPredicate<(all_of FeatureFP8Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 9036b26a6f6b..c5207228dc91 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -22,28 +22,28 @@ def CC_SI_Gfx : CallingConv<[ // 32 is reserved for the stack pointer // 33 is reserved for the frame pointer // 34 is reserved for the base pointer - CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[ SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29 ]>>>, - CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31 ]>>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>> ]>; def RetCC_SI_Gfx : CallingConv<[ CCIfType<[i1], CCPromoteToType<i32>>, CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -66,7 +66,7 @@ def RetCC_SI_Gfx : CallingConv<[ def CC_SI_SHADER : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[ SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7, SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15, SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23, @@ -76,7 +76,7 @@ def CC_SI_SHADER : CallingConv<[ ]>>>, // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs. 
- CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[ + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -109,7 +109,7 @@ def RetCC_SI_Shader : CallingConv<[ ]>>, // 32*4 + 4 is the minimum for a fetch shader with 32 outputs. - CCIfType<[f32, f16, v2f16] , CCAssignToReg<[ + CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -188,23 +188,23 @@ def CC_AMDGPU_Func : CallingConv<[ CCIfType<[i1], CCPromoteToType<i32>>, CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg< !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29 >>>, - CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[ + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>, - CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>> + CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>> ]>; // Calling convention for leaf functions def RetCC_AMDGPU_Func : CallingConv<[ CCIfType<[i1], CCPromoteToType<i32>>, CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>, - CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[ + CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<[ VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7, VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15, VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23, @@ -223,11 +223,11 @@ def CC_AMDGPU : CallingConv<[ ]>; def CC_AMDGPU_CS_CHAIN : CallingConv<[ - CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg< !foreach(i, !range(105), !cast<Register>("SGPR"#i)) >>>, - CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg< + CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg< !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i)) >>> ]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 8d4cad4c07bc..0c77fe725958 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -104,6 +104,13 @@ def foldable_fneg : GICombineRule< [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]), (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>; +// Detects s_mul_u64 instructions whose higher bits are zero/sign extended. 
+def smulu64 : GICombineRule< + (defs root:$smul, unsigned_matchinfo:$matchinfo), + (match (wip_match_opcode G_MUL):$smul, + [{ return matchCombine_s_mul_u64(*${smul}, ${matchinfo}); }]), + (apply [{ applyCombine_s_mul_u64(*${smul}, ${matchinfo}); }])>; + def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">; def sign_extension_in_reg : GICombineRule< @@ -149,7 +156,7 @@ def AMDGPUPostLegalizerCombiner: GICombiner< "AMDGPUPostLegalizerCombinerImpl", [all_combines, gfx6gfx7_combines, gfx8_combines, uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg, - rcp_sqrt_to_rsq, sign_extension_in_reg]> { + rcp_sqrt_to_rsq, sign_extension_in_reg, smulu64]> { let CombineAllMethodName = "tryCombineAllImpl"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b51a876750b5..74e9cd7d0965 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -646,7 +646,15 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs( Offset += 8; // Skipped. } - Offset += 72; // Reserved. + // Emit argument for hidden dynamic lds size + if (MFI.isDynamicLDSUsed()) { + emitKernelArg(DL, Int32Ty, Align(4), "hidden_dynamic_lds_size", Offset, + Args); + } else { + Offset += 4; // skipped + } + + Offset += 68; // Reserved. // hidden_private_base and hidden_shared_base are only when the subtarget has // ApertureRegs. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index bffea82ab8f4..719ae2e8750c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -303,6 +303,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { switch (N->getOpcode()) { case ISD::BUILD_VECTOR: + // TODO: Match load d16 from shl (extload:i16), 16 MadeChange |= matchLoadD16FromBuildVector(N); break; default: @@ -317,26 +318,16 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() { } } -bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N, - bool Negated) const { +bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { if (N->isUndef()) return true; const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (Negated) { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(-C->getAPIntValue()); + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) + return TII->isInlineConstant(C->getAPIntValue()); - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt()); - - } else { - if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) - return TII->isInlineConstant(C->getAPIntValue()); - - if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) - return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); - } + if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) + return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt()); return false; } @@ -382,7 +373,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, Subtarget->getRegisterInfo()->getRegClass(RCID); SDValue SubRegOp = N->getOperand(OpNo + 1); - unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue(); + unsigned SubRegIdx = SubRegOp->getAsZExtVal(); return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h 
index 374108af08cd..df4a211d42a0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -50,15 +50,13 @@ static inline bool getConstantValue(SDValue N, uint32_t &Out) { } // TODO: Handle undef as zero -static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, - bool Negate = false) { +static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2); uint32_t LHSVal, RHSVal; if (getConstantValue(N->getOperand(0), LHSVal) && getConstantValue(N->getOperand(1), RHSVal)) { SDLoc SL(N); - uint32_t K = Negate ? (-LHSVal & 0xffff) | (-RHSVal << 16) - : (LHSVal & 0xffff) | (RHSVal << 16); + uint32_t K = (LHSVal & 0xffff) | (RHSVal << 16); return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0), DAG.getTargetConstant(K, SL, MVT::i32)); } @@ -66,9 +64,6 @@ static inline SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG, return nullptr; } -static inline SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) { - return packConstantV2I16(N, DAG, true); -} } // namespace /// AMDGPU specific code to select AMDGPU machine instructions for @@ -110,10 +105,7 @@ protected: private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; - bool isInlineImmediate(const SDNode *N, bool Negated = false) const; - bool isNegInlineImmediate(const SDNode *N) const { - return isInlineImmediate(N, true); - } + bool isInlineImmediate(const SDNode *N) const; bool isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 8fbc90a6db9f..0dbcaf5a1b13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -387,17 +387,20 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32}, Custom); + + // FIXME: Why is v8f16/v8bf16 missing? 
setOperationAction( ISD::EXTRACT_SUBVECTOR, - {MVT::v2f16, MVT::v2i16, MVT::v4f16, MVT::v4i16, MVT::v2f32, - MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32, MVT::v4i32, - MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32, MVT::v7f32, - MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32, MVT::v9i32, - MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, - MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32, - MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64, - MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64, - MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16}, + {MVT::v2f16, MVT::v2bf16, MVT::v2i16, MVT::v4f16, MVT::v4bf16, + MVT::v4i16, MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, + MVT::v4f32, MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, + MVT::v6i32, MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, + MVT::v9f32, MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, + MVT::v11f32, MVT::v12i32, MVT::v12f32, MVT::v16f16, MVT::v16bf16, + MVT::v16i16, MVT::v16f32, MVT::v16i32, MVT::v32f32, MVT::v32i32, + MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64, + MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64, + MVT::v32i16, MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); @@ -3281,7 +3284,15 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext); } - assert(SrcVT == MVT::i64 && "operation should be legal"); + if (DestVT == MVT::bf16) { + SDLoc SL(Op); + SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); + } + + if (SrcVT != MVT::i64) + return Op; if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { SDLoc DL(Op); @@ -3319,7 +3330,15 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); } - assert(SrcVT == MVT::i64 && "operation should be legal"); + if (DestVT == MVT::bf16) { + SDLoc SL(Op); + SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src); + SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true); + return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag); + } + + if (SrcVT != MVT::i64) + return Op; // TODO: Factor out code common with LowerUINT_TO_FP. 
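The two hunks above lower UINT_TO_FP and SINT_TO_FP with a bf16 destination by first converting the integer to f32 and then emitting an FP_ROUND down to bf16. As a rough scalar model of what that node sequence computes — an illustrative sketch only, not part of this patch; the truncating round below is an assumption, since the actual FP_ROUND lowering may round to nearest-even:

    #include <cstdint>
    #include <cstring>

    // Illustrative only: treat bf16 as the high 16 bits of the f32 encoding.
    static uint16_t f32_to_bf16_truncate(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      return static_cast<uint16_t>(bits >> 16); // sign, exponent, top 7 mantissa bits
    }

    // Mirrors UINT_TO_FP (u32 -> f32) followed by FP_ROUND (f32 -> bf16).
    static uint16_t u32_to_bf16(uint32_t x) {
      float as_f32 = static_cast<float>(x);
      return f32_to_bf16_truncate(as_f32);
    }

The signed path in LowerSINT_TO_FP is the same shape with a signed int-to-float conversion in the first step.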
@@ -3517,7 +3536,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); } -SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, +SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); unsigned OpOpcode = Op.getOpcode(); @@ -3528,6 +3547,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, if (SrcVT == MVT::f16 && DestVT == MVT::i16) return Op; + if (SrcVT == MVT::bf16) { + SDLoc DL(Op); + SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src); + return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc); + } + // Promote i16 to i32 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { SDLoc DL(Op); @@ -3536,6 +3561,9 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32); } + if (DestVT != MVT::i64) + return Op; + if (SrcVT == MVT::f16 || (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) { SDLoc DL(Op); @@ -3546,7 +3574,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op, return DAG.getNode(Ext, DL, MVT::i64, FpToInt32); } - if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT); return SDValue(); @@ -4947,7 +4975,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y)) if (DestVT.isVector()) { SDValue Src = N->getOperand(0); - if (Src.getOpcode() == ISD::BUILD_VECTOR) { + if (Src.getOpcode() == ISD::BUILD_VECTOR && + (DCI.getDAGCombineLevel() < AfterLegalizeDAG || + isOperationLegal(ISD::BUILD_VECTOR, DestVT))) { EVT SrcVT = Src.getValueType(); unsigned NElts = DestVT.getVectorNumElements(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 88ef4b577424..ad8dcda93c36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2764,7 +2764,9 @@ static bool isConstant(const MachineInstr &MI) { void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const { - const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg()); + unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 
0 : 1; + const MachineInstr *PtrMI = + MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg()); assert(PtrMI); @@ -2817,6 +2819,10 @@ bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const { if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) return true; + if (MI.getOpcode() == AMDGPU::G_PREFETCH) + return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() == + AMDGPU::SGPRRegBankID; + const Instruction *I = dyn_cast<Instruction>(Ptr); return I && I->getMetadata("amdgpu.uniform"); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index fbee28889451..aa235c07e995 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -701,13 +701,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .maxScalar(0, S32); } - getActionDefinitionsBuilder(G_MUL) - .legalFor({S32, S16, V2S16}) - .clampMaxNumElementsStrict(0, S16, 2) - .scalarize(0) - .minScalar(0, S16) - .widenScalarToNextMultipleOf(0, 32) - .custom(); + if (ST.hasScalarSMulU64()) { + getActionDefinitionsBuilder(G_MUL) + .legalFor({S64, S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + } else { + getActionDefinitionsBuilder(G_MUL) + .legalFor({S32, S16, V2S16}) + .clampMaxNumElementsStrict(0, S16, 2) + .scalarize(0) + .minScalar(0, S16) + .widenScalarToNextMultipleOf(0, 32) + .custom(); + } assert(ST.hasMad64_32()); getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) @@ -1996,8 +2006,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, verify(*ST.getInstrInfo()); } -bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool AMDGPULegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &B = Helper.MIRBuilder; MachineRegisterInfo &MRI = *B.getMRI(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h index 855fa0ddc214..56aabd4f6ab7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -27,7 +27,6 @@ class MachineIRBuilder; namespace AMDGPU { struct ImageDimIntrinsicInfo; } -/// This class provides the information for the target register banks. class AMDGPULegalizerInfo final : public LegalizerInfo { const GCNSubtarget &ST; @@ -35,7 +34,8 @@ public: AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index f03e6b8915b1..1b2f74cf153b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -87,9 +87,6 @@ private: Constant *copr0, Constant *copr1); bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo); - // sqrt - bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo); - /// Insert a value to sincos function \p Fsincos. Returns (value of sin, value /// of cos, sincos call). 
std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg, @@ -672,8 +669,6 @@ bool AMDGPULibCalls::fold(CallInst *CI) { // Specialized optimizations for each function call. // - // TODO: Handle other simple intrinsic wrappers. Sqrt. - // // TODO: Handle native functions switch (FInfo.getId()) { case AMDGPULibFunc::EI_EXP: @@ -794,7 +789,9 @@ bool AMDGPULibCalls::fold(CallInst *CI) { case AMDGPULibFunc::EI_ROOTN: return fold_rootn(FPOp, B, FInfo); case AMDGPULibFunc::EI_SQRT: - return fold_sqrt(FPOp, B, FInfo); + // TODO: Allow with strictfp + constrained intrinsic + return tryReplaceLibcallWithSimpleIntrinsic( + B, CI, Intrinsic::sqrt, true, true, /*AllowStrictFP=*/false); case AMDGPULibFunc::EI_COS: case AMDGPULibFunc::EI_SIN: return fold_sincos(FPOp, B, FInfo); @@ -1273,29 +1270,6 @@ bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( return true; } -// fold sqrt -> native_sqrt (x) -bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, - const FuncInfo &FInfo) { - if (!isUnsafeMath(FPOp)) - return false; - - if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) && - (FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) { - Module *M = B.GetInsertBlock()->getModule(); - - if (FunctionCallee FPExpr = getNativeFunction( - M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) { - Value *opr0 = FPOp->getOperand(0); - LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " - << "sqrt(" << *opr0 << ")\n"); - Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt"); - replaceCall(FPOp, nval); - return true; - } - } - return false; -} - std::tuple<Value *, Value *, Value *> AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B, FunctionCallee Fsincos) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 323462e60a29..31777295b4f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -19,6 +19,26 @@ using namespace llvm; +static const GlobalVariable * +getKernelDynLDSGlobalFromFunction(const Function &F) { + const Module *M = F.getParent(); + SmallString<64> KernelDynLDSName("llvm.amdgcn."); + KernelDynLDSName += F.getName(); + KernelDynLDSName += ".dynlds"; + return M->getNamedGlobal(KernelDynLDSName); +} + +static bool hasLDSKernelArgument(const Function &F) { + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + if (auto PtrTy = dyn_cast<PointerType>(ArgTy)) { + if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) + return true; + } + } + return false; +} + AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST) : IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())), @@ -65,6 +85,10 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F, Attribute NSZAttr = F.getFnAttribute("no-signed-zeros-fp-math"); NoSignedZerosFPMath = NSZAttr.isStringAttribute() && NSZAttr.getValueAsString() == "true"; + + const GlobalVariable *DynLdsGlobal = getKernelDynLDSGlobalFromFunction(F); + if (DynLdsGlobal || hasLDSKernelArgument(F)) + UsesDynamicLDS = true; } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, @@ -139,15 +163,6 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, return Offset; } -static const GlobalVariable * -getKernelDynLDSGlobalFromFunction(const Function &F) { - const Module *M = F.getParent(); - std::string KernelDynLDSName = "llvm.amdgcn."; - KernelDynLDSName += F.getName(); - KernelDynLDSName 
+= ".dynlds"; - return M->getNamedGlobal(KernelDynLDSName); -} - std::optional<uint32_t> AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { // TODO: Would be more consistent with the abs symbols to use a range @@ -210,3 +225,9 @@ void AMDGPUMachineFunction::setDynLDSAlign(const Function &F, } } } + +void AMDGPUMachineFunction::setUsesDynamicLDS(bool DynLDS) { + UsesDynamicLDS = DynLDS; +} + +bool AMDGPUMachineFunction::isDynamicLDSUsed() const { return UsesDynamicLDS; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 248ee26a47eb..7efb7f825348 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -46,6 +46,9 @@ protected: /// stages. Align DynLDSAlign; + // Flag to check dynamic LDS usage by kernel. + bool UsesDynamicLDS = false; + // Kernels + shaders. i.e. functions called by the hardware and not called // by other functions. bool IsEntryFunction = false; @@ -119,6 +122,10 @@ public: Align getDynLDSAlign() const { return DynLDSAlign; } void setDynLDSAlign(const Function &F, const GlobalVariable &GV); + + void setUsesDynamicLDS(bool DynLDS); + + bool isDynamicLDSUsed() const; }; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index 7b18e1f805d8..21bfab52c6c4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -104,6 +104,14 @@ public: void applyCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo) const; + // Find the s_mul_u64 instructions where the higher bits are either + // zero-extended or sign-extended. + bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; + // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher + // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32 + // bits are zero extended. + void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; + private: #define GET_GICOMBINER_CLASS_MEMBERS #define AMDGPUSubtarget GCNSubtarget @@ -419,6 +427,32 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg( MI.eraseFromParent(); } +bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64( + MachineInstr &MI, unsigned &NewOpcode) const { + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + if (MRI.getType(Src0) != LLT::scalar(64)) + return false; + + if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 && + KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) { + NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32; + return true; + } + + if (KB->computeNumSignBits(Src1) >= 33 && + KB->computeNumSignBits(Src0) >= 33) { + NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32; + return true; + } + return false; +} + +void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64( + MachineInstr &MI, unsigned &NewOpcode) const { + Helper.replaceOpcodeWith(MI, NewOpcode); +} + // Pass boilerplate // ================ diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index fba060464a6e..391c2b9ec256 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -441,7 +441,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects( // FIXME: Returns uniform if there's no source value information. 
This is // probably wrong. -static bool isScalarLoadLegal(const MachineInstr &MI) { +bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const { if (!MI.hasOneMemOperand()) return false; @@ -2094,6 +2094,74 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( return true; } +// Break s_mul_u64 into 32-bit vector operations. +void AMDGPURegisterBankInfo::applyMappingSMULU64( + MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { + SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); + SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); + SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); + + // All inputs are SGPRs, nothing special to do. + if (DefRegs.empty()) { + assert(Src0Regs.empty() && Src1Regs.empty()); + applyDefaultMapping(OpdMapper); + return; + } + + assert(DefRegs.size() == 2); + assert(Src0Regs.size() == Src1Regs.size() && + (Src0Regs.empty() || Src0Regs.size() == 2)); + + MachineRegisterInfo &MRI = OpdMapper.getMRI(); + MachineInstr &MI = OpdMapper.getMI(); + Register DstReg = MI.getOperand(0).getReg(); + LLT HalfTy = LLT::scalar(32); + + // Depending on where the source registers came from, the generic code may + // have decided to split the inputs already or not. If not, we still need to + // extract the values. + + if (Src0Regs.empty()) + split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); + else + setRegsToType(MRI, Src0Regs, HalfTy); + + if (Src1Regs.empty()) + split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); + else + setRegsToType(MRI, Src1Regs, HalfTy); + + setRegsToType(MRI, DefRegs, HalfTy); + + // The multiplication is done as follows: + // + // Op1H Op1L + // * Op0H Op0L + // -------------------- + // Op1H*Op0L Op1L*Op0L + // + Op1H*Op0H Op1L*Op0H + // ----------------------------------------- + // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L + // + // We drop Op1H*Op0H because the result of the multiplication is a 64-bit + // value and that would overflow. + // The low 32-bit value is Op1L*Op0L. + // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from + // Op1L*Op0L). + + ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank); + + Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0); + Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0); + Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0); + Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0); + B.buildAdd(DefRegs[1], Add, MulHiLo); + B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]); + + MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); + MI.eraseFromParent(); +} + void AMDGPURegisterBankInfo::applyMappingImpl( MachineIRBuilder &B, const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -2394,13 +2462,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); + // Special case for s_mul_u64. There is not a vector equivalent of + // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector + // multiplications. + if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) { + applyMappingSMULU64(B, OpdMapper); + return; + } + // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. // Packed 16-bit operations need to be scalarized and promoted. 
if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16)) break; const RegisterBank *DstBank = - OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; if (DstBank == &AMDGPU::VGPRRegBank) break; @@ -2451,6 +2527,72 @@ void AMDGPURegisterBankInfo::applyMappingImpl( return; } + case AMDGPU::G_AMDGPU_S_MUL_I64_I32: + case AMDGPU::G_AMDGPU_S_MUL_U64_U32: { + // This is a special case for s_mul_u64. We use + // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation + // where the 33 higher bits are sign-extended and + // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation + // where the 32 higher bits are zero-extended. In case scalar registers are + // selected, both opcodes are lowered as s_mul_u64. If the vector registers + // are selected, then G_AMDGPU_S_MUL_I64_I32 and + // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction. + + // Insert basic copies. + applyDefaultMapping(OpdMapper); + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg0 = MI.getOperand(1).getReg(); + Register SrcReg1 = MI.getOperand(2).getReg(); + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 " + "that handles only 64-bit operands."); + const RegisterBank *DstBank = + OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; + + // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 + // with s_mul_u64 operation. + if (DstBank == &AMDGPU::SGPRRegBank) { + MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); + MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass); + MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass); + MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass); + return; + } + + // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32 + // with a vector mad. + assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank && + "The destination operand should be in vector registers."); + + DebugLoc DL = MI.getDebugLoc(); + + // Extract the lower subregister from the first operand. + Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass); + MRI.setType(Op0L, S32); + B.buildTrunc(Op0L, SrcReg0); + + // Extract the lower subregister from the second operand. + Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass); + MRI.setType(Op1L, S32); + B.buildTrunc(Op1L, SrcReg1); + + unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32 + ? 
AMDGPU::G_AMDGPU_MAD_U64_U32 + : AMDGPU::G_AMDGPU_MAD_I64_I32; + + MachineIRBuilder B(MI); + Register Zero64 = B.buildConstant(S64, 0).getReg(0); + MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass); + Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass); + B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64}); + MI.eraseFromParent(); + return; + } case AMDGPU::G_SEXT_INREG: { SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); if (SrcRegs.empty()) @@ -3263,17 +3405,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - unsigned PtrBank = - getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID); + Register PtrReg = MI.getOperand(0).getReg(); + unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID); if (PtrBank == AMDGPU::VGPRRegBankID) { MI.eraseFromParent(); return; } - // FIXME: There is currently no support for prefetch in global isel. - // There is no node equivalence and what's worse there is no MMO produced - // for a prefetch on global isel path. - // Prefetch does not affect execution so erase it for now. - MI.eraseFromParent(); + unsigned AS = MRI.getType(PtrReg).getAddressSpace(); + if (!AMDGPU::isFlatGlobalAddrSpace(AS) && + AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + MI.eraseFromParent(); + return; + } + applyDefaultMapping(OpdMapper); return; } default: @@ -3667,7 +3811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_AND: case AMDGPU::G_OR: - case AMDGPU::G_XOR: { + case AMDGPU::G_XOR: + case AMDGPU::G_MUL: { unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); if (Size == 1) { const RegisterBank *DstBank @@ -3735,7 +3880,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_PTRMASK: case AMDGPU::G_ADD: case AMDGPU::G_SUB: - case AMDGPU::G_MUL: case AMDGPU::G_SHL: case AMDGPU::G_LSHR: case AMDGPU::G_ASHR: @@ -3753,6 +3897,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case AMDGPU::G_SHUFFLE_VECTOR: case AMDGPU::G_SBFX: case AMDGPU::G_UBFX: + case AMDGPU::G_AMDGPU_S_MUL_I64_I32: + case AMDGPU::G_AMDGPU_S_MUL_U64_U32: if (isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index b5d16e70ab23..5f550b426ec0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -84,6 +84,9 @@ public: bool applyMappingMAD_64_32(MachineIRBuilder &B, const OperandsMapper &OpdMapper) const; + void applyMappingSMULU64(MachineIRBuilder &B, + const OperandsMapper &OpdMapper) const; + Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const; @@ -173,6 +176,8 @@ public: const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override; + bool isScalarLoadLegal(const MachineInstr &MI) const; + InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index fdc2077868cf..0f3bb3e7b0d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -620,7 +620,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis<AMDGPUAA>(); } -void 
AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void AMDGPUTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [this](StringRef PassName, ModulePassManager &PM, ArrayRef<PassBuilder::PipelineElement>) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 9051a61e6557..99c9db3e654a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -51,7 +51,8 @@ public: return TLOF.get(); } - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; void registerDefaultAliasAnalyses(AAManager &) override; /// Get the integer value of a null pointer in the given address space. diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index abd7e911beef..b7f043860115 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -166,6 +166,8 @@ public: ImmTyEndpgm, ImmTyWaitVDST, ImmTyWaitEXP, + ImmTyWaitVAVDst, + ImmTyWaitVMVSrc, }; // Immediate operand kind. @@ -909,6 +911,8 @@ public: bool isEndpgm() const; bool isWaitVDST() const; bool isWaitEXP() const; + bool isWaitVAVDst() const; + bool isWaitVMVSrc() const; auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const { return std::bind(P, *this); @@ -1029,6 +1033,7 @@ public: } static void printImmTy(raw_ostream& OS, ImmTy Type) { + // clang-format off switch (Type) { case ImmTyNone: OS << "None"; break; case ImmTyGDS: OS << "GDS"; break; @@ -1086,7 +1091,10 @@ public: case ImmTyEndpgm: OS << "Endpgm"; break; case ImmTyWaitVDST: OS << "WaitVDST"; break; case ImmTyWaitEXP: OS << "WaitEXP"; break; + case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break; + case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; } + // clang-format on } void print(raw_ostream &OS) const override { @@ -1857,6 +1865,9 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_INLINE_C_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: return &APFloat::IEEEsingle(); @@ -1871,13 +1882,10 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) { case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_KIMM16: return &APFloat::IEEEhalf(); @@ -2025,9 +2033,14 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { // We allow fp literals with f16x2 operands assuming that the specified // literal goes into the lower half and the upper half is zero. We also // require that the literal may be losslessly converted to f16. - MVT ExpectedType = (type == MVT::v2f16)? MVT::f16 : - (type == MVT::v2i16)? MVT::i16 : - (type == MVT::v2f32)? 
MVT::f32 : type; + // + // For i16x2 operands, we assume that the specified literal is encoded as a + // single-precision float. This is pretty odd, but it matches SP3 and what + // happens in hardware. + MVT ExpectedType = (type == MVT::v2f16) ? MVT::f16 + : (type == MVT::v2i16) ? MVT::f32 + : (type == MVT::v2f32) ? MVT::f32 + : type; APFloat FPLiteral(APFloat::IEEEdouble(), APInt(64, Imm.Val)); return canLosslesslyConvertToFPType(FPLiteral, ExpectedType); @@ -3393,12 +3406,12 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16) - return AMDGPU::isInlinableIntLiteralV216(Val); + return AMDGPU::isInlinableLiteralV2I16(Val); if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 || OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 || OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm()); + return AMDGPU::isInlinableLiteralV2F16(Val); return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm()); } @@ -9192,6 +9205,14 @@ bool AMDGPUOperand::isWaitVDST() const { return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm()); } +bool AMDGPUOperand::isWaitVAVDst() const { + return isImmTy(ImmTyWaitVAVDst) && isUInt<4>(getImm()); +} + +bool AMDGPUOperand::isWaitVMVSrc() const { + return isImmTy(ImmTyWaitVMVSrc) && isUInt<1>(getImm()); +} + //===----------------------------------------------------------------------===// // VINTERP //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 43d35fa5291c..9e99d382ed9b 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -503,7 +503,6 @@ class MUBUF_Load_Pseudo <string opName, let has_vdata = !not(!or(isLds, isLdsOpc)); let mayLoad = 1; let mayStore = isLds; - let maybeAtomic = 1; let Uses = !if(!or(isLds, isLdsOpc) , [EXEC, M0], [EXEC]); let tfe = isTFE; let lds = isLds; @@ -610,7 +609,6 @@ class MUBUF_Store_Pseudo <string opName, getAddrName<addrKindCopy>.ret; let mayLoad = 0; let mayStore = 1; - let maybeAtomic = 1; let elements = getMUBUFElements<store_vt>.ret; let tfe = isTFE; } @@ -671,7 +669,6 @@ class MUBUF_Pseudo_Store_Lds<string opName> let LGKM_CNT = 1; let mayLoad = 1; let mayStore = 1; - let maybeAtomic = 1; let has_vdata = 0; let has_vaddr = 0; @@ -735,7 +732,6 @@ class MUBUF_Atomic_Pseudo<string opName, let has_glc = 0; let has_dlc = 0; let has_sccb = 1; - let maybeAtomic = 1; let AsmMatchConverter = "cvtMubufAtomic"; } @@ -1222,8 +1218,10 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < } // End HasD16LoadStore -def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1", - int_amdgcn_buffer_wbinvl1>; +let SubtargetPredicate = isNotGFX12Plus in +def BUFFER_WBINVL1 : MUBUF_Invalidate < + "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1 +>; let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN< diff --git a/llvm/lib/Target/AMDGPU/DSDIRInstructions.td b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td new file mode 100644 index 000000000000..4416da605981 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/DSDIRInstructions.td @@ -0,0 +1,192 @@ +//===-- DSDIRInstructions.td - LDS/VDS Direct Instruction Definitions -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR encoding (LDSDIR is gfx11, VDSDIR is gfx12+) +//===----------------------------------------------------------------------===// + +class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23-22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +class VDSDIRe<bits<2> op, bit is_direct> : Enc32 { + // encoding fields + bits<2> attrchan; + bits<6> attr; + bits<4> waitvdst; + bits<8> vdst; + bits<1> waitvsrc; + + // encoding + let Inst{31-24} = 0xce; // encoding + let Inst{23} = waitvsrc; + let Inst{22} = 0x0; // reserved + let Inst{21-20} = op; + let Inst{19-16} = waitvdst; + let Inst{15-10} = !if(is_direct, ?, attr); + let Inst{9-8} = !if(is_direct, ?, attrchan); + let Inst{7-0} = vdst; +} + +//===----------------------------------------------------------------------===// +// LDSDIR/VDSDIR Classes +//===----------------------------------------------------------------------===// + +class LDSDIR_getIns<bit direct> { + dag ret = !if(direct, + (ins wait_vdst:$waitvdst), + (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) + ); +} + +class VDSDIR_getIns<bit direct> { + dag ret = !if(direct, + (ins wait_va_vdst:$waitvdst, wait_va_vsrc:$waitvsrc), + (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_va_vdst:$waitvdst, + wait_va_vsrc:$waitvsrc) + ); +} + +class DSDIR_Common<string opName, string asm = "", dag ins, bit direct> : + InstSI<(outs VGPR_32:$vdst), ins, asm> { + let LDSDIR = 1; + let EXP_CNT = 1; + + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + let maybeAtomic = 0; + + string Mnemonic = opName; + let UseNamedOperandTable = 1; + + let Uses = [M0, EXEC]; + let DisableWQM = 0; + let SchedRW = [WriteLDS]; + + bit is_direct; + let is_direct = direct; +} + +class DSDIR_Pseudo<string opName, dag ins, bit direct> : + DSDIR_Common<opName, "", ins, direct>, + SIMCInstr<opName, SIEncodingFamily.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} + +class LDSDIR_getAsm<bit direct> { + string ret = !if(direct, + " $vdst$waitvdst", + " $vdst, $attr$attrchan$waitvdst" + ); +} + +class VDSDIR_getAsm<bit direct> { + string ret = !if(direct, + " $vdst$waitvdst$waitvsrc", + " $vdst, $attr$attrchan$waitvdst$waitvsrc" + ); +} + +class DSDIR_Real<DSDIR_Pseudo lds, dag ins, string asm, int subtarget> : + DSDIR_Common<lds.Mnemonic, + lds.Mnemonic # asm, + ins, + lds.is_direct>, + SIMCInstr <lds.Mnemonic, subtarget> { + let isPseudo = 0; + let isCodeGenOnly = 0; +} + +//===----------------------------------------------------------------------===// +// LDS/VDS Direct Instructions +//===----------------------------------------------------------------------===// + +let SubtargetPredicate = isGFX11Only in { + +def LDS_DIRECT_LOAD : DSDIR_Pseudo<"lds_direct_load", LDSDIR_getIns<1>.ret, 1>; +def LDS_PARAM_LOAD : DSDIR_Pseudo<"lds_param_load", LDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (LDS_DIRECT_LOAD 0) +>; + +def : GCNPat < 
+ (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) +>; + +} // End SubtargetPredicate = isGFX11Only + +let SubtargetPredicate = isGFX12Plus in { + +def DS_DIRECT_LOAD : DSDIR_Pseudo<"ds_direct_load", VDSDIR_getIns<1>.ret, 1>; +def DS_PARAM_LOAD : DSDIR_Pseudo<"ds_param_load", VDSDIR_getIns<0>.ret, 0>; + +def : GCNPat < + (f32 (int_amdgcn_lds_direct_load M0)), + (DS_DIRECT_LOAD 0, 1) +>; + +def : GCNPat < + (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), + (DS_PARAM_LOAD timm:$attr, timm:$attrchan, 0, 1) +>; + +} // End SubtargetPredicate = isGFX12Only + +//===----------------------------------------------------------------------===// +// GFX11 +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx11<bits<2> op, + DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> { + def _gfx11 : DSDIR_Real<lds, lds.InOperandList, + LDSDIR_getAsm<lds.is_direct>.ret, + SIEncodingFamily.GFX11>, + LDSDIRe<op, lds.is_direct> { + let AssemblerPredicate = isGFX11Only; + let DecoderNamespace = "GFX11"; + } +} + +defm LDS_PARAM_LOAD : DSDIR_Real_gfx11<0x0>; +defm LDS_DIRECT_LOAD : DSDIR_Real_gfx11<0x1>; + +//===----------------------------------------------------------------------===// +// GFX12+ +//===----------------------------------------------------------------------===// + +multiclass DSDIR_Real_gfx12<bits<2> op, + DSDIR_Pseudo lds = !cast<DSDIR_Pseudo>(NAME)> { + def _gfx12 : DSDIR_Real<lds, lds.InOperandList, + VDSDIR_getAsm<lds.is_direct>.ret, + SIEncodingFamily.GFX12>, + VDSDIRe<op, lds.is_direct> { + let AssemblerPredicate = isGFX12Plus; + let DecoderNamespace = "GFX12"; + } +} + +defm DS_PARAM_LOAD : DSDIR_Real_gfx12<0x0>; +defm DS_DIRECT_LOAD : DSDIR_Real_gfx12<0x1>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index bc9049b4ef33..3cccd8c50e66 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -19,7 +19,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt // Most instruction load and store data, so set this as the default. let mayLoad = 1; let mayStore = 1; - let maybeAtomic = 1; let hasSideEffects = 0; let SchedRW = [WriteLDS]; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 67be7b0fd642..9dff3f6c2efd 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -182,6 +182,9 @@ static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \ false, ImmWidth) +#define DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(Name, OpWidth, ImmWidth) \ + DECODE_SrcOp(decodeOperand_##Name, 9, OpWidth, Imm, false, ImmWidth) + // Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc) // and decode using 'enum10' from decodeSrcOp. 
#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \ @@ -262,6 +265,9 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2I16, OPW32, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9_TYPED(VS_32_ImmV2F16, OPW32, 16) + DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32) DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64) diff --git a/llvm/lib/Target/AMDGPU/EXPInstructions.td b/llvm/lib/Target/AMDGPU/EXPInstructions.td index ff1d661ef6fe..4cfee7d013ef 100644 --- a/llvm/lib/Target/AMDGPU/EXPInstructions.td +++ b/llvm/lib/Target/AMDGPU/EXPInstructions.td @@ -20,6 +20,7 @@ class EXPCommon<bit row, bit done, string asm = ""> : InstSI< let EXP_CNT = 1; let mayLoad = done; let mayStore = 1; + let maybeAtomic = 0; let UseNamedOperandTable = 1; let Uses = !if(row, [EXEC, M0], [EXEC]); let SchedRW = [WriteExport]; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 615f8cd54d8f..16a8b770e057 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -60,6 +60,7 @@ class FLAT_Pseudo<string opName, dag outs, dag ins, bits<1> has_sve = 0; // Scratch VGPR Enable bits<1> lds = 0; bits<1> sve = 0; + bits<1> has_offset = 1; let SubtargetPredicate = !if(is_flat_global, HasFlatGlobalInsts, !if(is_flat_scratch, HasFlatScratchInsts, HasFlatAddressSpace)); @@ -182,7 +183,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{51-50} = cpol{4-3}; // scope let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?); let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?); - let Inst{95-72} = offset; + let Inst{95-72} = !if(ps.has_offset, offset, ?); } class GlobalSaddrTable <bit is_saddr, string Name = ""> { @@ -214,7 +215,6 @@ class FLAT_Load_Pseudo <string opName, RegisterClass regClass, let has_saddr = HasSaddr; let enabled_saddr = EnableSaddr; let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); - let maybeAtomic = 1; let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); @@ -236,7 +236,6 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let has_saddr = HasSaddr; let enabled_saddr = EnableSaddr; let PseudoInstr = opName#!if(!and(HasSaddr, EnableSaddr), "_SADDR", ""); - let maybeAtomic = 1; } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -262,7 +261,6 @@ class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass, let has_vaddr = 0; let has_saddr = 1; let enabled_saddr = EnableSaddr; - let maybeAtomic = 1; let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); @@ -329,7 +327,6 @@ class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass, let has_vaddr = 0; let has_saddr = 1; let enabled_saddr = EnableSaddr; - let maybeAtomic = 1; let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", ""); } @@ -340,6 +337,34 @@ multiclass FLAT_Global_Store_AddTid_Pseudo<string opName, RegisterClass regClass GlobalSaddrTable<1, opName>; } +class FLAT_Global_Invalidate_Writeback<string opName, SDPatternOperator node = null_frag> : + FLAT_Pseudo<opName, (outs), (ins CPol:$cpol), "$cpol", [(node)]> { + + let AsmMatchConverter = ""; + + 
let hasSideEffects = 1; + let mayLoad = 0; + let mayStore = 0; + let is_flat_global = 1; + + let has_offset = 0; + let has_saddr = 0; + let enabled_saddr = 0; + let saddr_value = 0; + let has_vdst = 0; + let has_data = 0; + let has_vaddr = 0; + let has_glc = 0; + let has_dlc = 0; + let glcValue = 0; + let dlcValue = 0; + let has_sccb = 0; + let sccbValue = 0; + let has_sve = 0; + let lds = 0; + let sve = 0; +} + class FlatScratchInst <string sv_op, string mode> { string SVOp = sv_op; string Mode = mode; @@ -372,7 +397,6 @@ class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, let has_sve = EnableSVE; let sve = EnableVaddr; let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"))); - let maybeAtomic = 1; let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); @@ -401,7 +425,6 @@ class FLAT_Scratch_Store_Pseudo <string opName, RegisterClass vdataClass, bit En let has_sve = EnableSVE; let sve = EnableVaddr; let PseudoInstr = opName#!if(EnableSVE, "_SVS", !if(EnableSaddr, "_SADDR", !if(EnableVaddr, "", "_ST"))); - let maybeAtomic = 1; } multiclass FLAT_Scratch_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedOutput = 0> { @@ -491,7 +514,6 @@ class FLAT_AtomicNoRet_Pseudo<string opName, dag outs, dag ins, let has_vdst = 0; let has_sccb = 1; let sccbValue = 0; - let maybeAtomic = 1; let IsAtomicNoRet = 1; } @@ -928,6 +950,10 @@ defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dwor let SubtargetPredicate = isGFX12Plus in { defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>; + + def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">; + def GLOBAL_WB : FLAT_Global_Invalidate_Writeback<"global_wb">; + def GLOBAL_WBINV : FLAT_Global_Invalidate_Writeback<"global_wbinv">; } // End SubtargetPredicate = isGFX12Plus } // End is_flat_global = 1 @@ -2662,6 +2688,10 @@ defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_A defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">; defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">; +defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">; +defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">; +defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">; + // ENC_VSCRATCH. 
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>; defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 91a709303269..f6f37f5170a4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -176,6 +176,7 @@ protected: bool HasGetWaveIdInst = false; bool HasSMemTimeInst = false; bool HasShaderCyclesRegister = false; + bool HasShaderCyclesHiLoRegisters = false; bool HasVOP3Literal = false; bool HasNoDataDepHazard = false; bool FlatAddressSpace = false; @@ -682,6 +683,8 @@ public: bool hasScalarAddSub64() const { return getGeneration() >= GFX12; } + bool hasScalarSMulU64() const { return getGeneration() >= GFX12; } + bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; } @@ -819,6 +822,10 @@ public: return HasShaderCyclesRegister; } + bool hasShaderCyclesHiLoRegisters() const { + return HasShaderCyclesHiLoRegisters; + } + bool hasVOP3Literal() const { return HasVOP3Literal; } @@ -1096,7 +1103,7 @@ public: bool hasDstSelForwardingHazard() const { return GFX940Insts; } // Cannot use op_sel with v_dot instructions. - bool hasDOTOpSelHazard() const { return GFX940Insts; } + bool hasDOTOpSelHazard() const { return GFX940Insts || GFX11Insts; } // Does not have HW interlocs for VALU writing and then reading SGPRs. bool hasVDecCoExecHazard() const { diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td deleted file mode 100644 index 4956a1586774..000000000000 --- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td +++ /dev/null @@ -1,116 +0,0 @@ -//===-- LDSDIRInstructions.td - LDS Direct Instruction Definitions --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// LDSDIR encoding -//===----------------------------------------------------------------------===// - -class LDSDIRe<bits<2> op, bit is_direct> : Enc32 { - // encoding fields - bits<2> attrchan; - bits<6> attr; - bits<4> waitvdst; - bits<8> vdst; - - // encoding - let Inst{31-24} = 0xce; // encoding - let Inst{23-22} = 0x0; // reserved - let Inst{21-20} = op; - let Inst{19-16} = waitvdst; - let Inst{15-10} = !if(is_direct, ?, attr); - let Inst{9-8} = !if(is_direct, ?, attrchan); - let Inst{7-0} = vdst; -} - -//===----------------------------------------------------------------------===// -// LDSDIR Classes -//===----------------------------------------------------------------------===// - -class LDSDIR_getIns<bit direct> { - dag ret = !if(direct, - (ins wait_vdst:$waitvdst), - (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst) - ); -} - -class LDSDIR_Common<string opName, string asm = "", bit direct> : InstSI< - (outs VGPR_32:$vdst), - LDSDIR_getIns<direct>.ret, - asm> { - let LDSDIR = 1; - let EXP_CNT = 1; - - let hasSideEffects = 0; - let mayLoad = 1; - let mayStore = 0; - - string Mnemonic = opName; - let UseNamedOperandTable = 1; - - let Uses = [M0, EXEC]; - let DisableWQM = 0; - let SchedRW = [WriteLDS]; - - bit is_direct; - let is_direct = direct; -} - -class LDSDIR_Pseudo<string opName, bit direct> : - LDSDIR_Common<opName, "", direct>, - SIMCInstr<opName, SIEncodingFamily.NONE> { - let isPseudo = 1; - let isCodeGenOnly = 1; -} - -class LDSDIR_getAsm<bit direct> { - string ret = !if(direct, - " $vdst$waitvdst", - " $vdst, $attr$attrchan$waitvdst" - ); -} - -class LDSDIR_Real<bits<2> op, LDSDIR_Pseudo lds, int subtarget> : - LDSDIR_Common<lds.Mnemonic, - lds.Mnemonic # LDSDIR_getAsm<lds.is_direct>.ret, - lds.is_direct>, - SIMCInstr <lds.Mnemonic, subtarget>, - LDSDIRe<op, lds.is_direct> { - let isPseudo = 0; - let isCodeGenOnly = 0; -} - -//===----------------------------------------------------------------------===// -// LDS Direct Instructions -//===----------------------------------------------------------------------===// - -def LDS_DIRECT_LOAD : LDSDIR_Pseudo<"lds_direct_load", 1>; -def LDS_PARAM_LOAD : LDSDIR_Pseudo<"lds_param_load", 0>; - -def : GCNPat < - (f32 (int_amdgcn_lds_direct_load M0)), - (LDS_DIRECT_LOAD 0) ->; - -def : GCNPat < - (f32 (int_amdgcn_lds_param_load timm:$attrchan, timm:$attr, M0)), - (LDS_PARAM_LOAD timm:$attr, timm:$attrchan, 0) ->; - -//===----------------------------------------------------------------------===// -// GFX11+ -//===----------------------------------------------------------------------===// - -multiclass LDSDIR_Real_gfx11<bits<2> op, LDSDIR_Pseudo lds = !cast<LDSDIR_Pseudo>(NAME)> { - def _gfx11 : LDSDIR_Real<op, lds, SIEncodingFamily.GFX11> { - let AssemblerPredicate = isGFX11Plus; - let DecoderNamespace = "GFX11"; - } -} - -defm LDS_PARAM_LOAD : LDSDIR_Real_gfx11<0x0>; -defm LDS_DIRECT_LOAD : LDSDIR_Real_gfx11<0x1>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index edc244db613d..6c7977e22599 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -460,56 +460,84 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm, } 
} -void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - int16_t SImm = static_cast<int16_t>(Imm); - if (isInlinableIntLiteral(SImm)) { - O << SImm; - return; - } - +// This must accept a 32-bit immediate value to correctly handle packed 16-bit +// operations. +static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == 0x3C00) - O<< "1.0"; + O << "1.0"; else if (Imm == 0xBC00) - O<< "-1.0"; + O << "-1.0"; else if (Imm == 0x3800) - O<< "0.5"; + O << "0.5"; else if (Imm == 0xB800) - O<< "-0.5"; + O << "-0.5"; else if (Imm == 0x4000) - O<< "2.0"; + O << "2.0"; else if (Imm == 0xC000) - O<< "-2.0"; + O << "-2.0"; else if (Imm == 0x4400) - O<< "4.0"; + O << "4.0"; else if (Imm == 0xC400) - O<< "-4.0"; - else if (Imm == 0x3118 && - STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) { + O << "-4.0"; + else if (Imm == 0x3118 && STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; - } else { - uint64_t Imm16 = static_cast<uint16_t>(Imm); - O << formatHex(Imm16); - } -} + else + return false; -void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O) { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - printImmediate16(Lo16, STI, O); + return true; } -void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { + int16_t SImm = static_cast<int16_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + uint16_t HImm = static_cast<uint16_t>(Imm); + if (printImmediateFloat16(HImm, STI, O)) + return; + + uint64_t Imm16 = static_cast<uint16_t>(Imm); + O << formatHex(Imm16); +} + +void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, + raw_ostream &O) { int32_t SImm = static_cast<int32_t>(Imm); - if (SImm >= -16 && SImm <= 64) { + if (isInlinableIntLiteral(SImm)) { O << SImm; return; } + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + if (printImmediateFloat32(Imm, STI, O)) + return; + break; + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + if (isUInt<16>(Imm) && + printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O)) + return; + break; + default: + llvm_unreachable("bad operand type"); + } + + O << formatHex(static_cast<uint64_t>(Imm)); +} + +bool AMDGPUInstPrinter::printImmediateFloat32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { if (Imm == llvm::bit_cast<uint32_t>(0.0f)) O << "0.0"; else if (Imm == llvm::bit_cast<uint32_t>(1.0f)) @@ -532,7 +560,24 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494"; else - O << formatHex(static_cast<uint64_t>(Imm)); + return false; + + return true; +} + +void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int32_t SImm = static_cast<int32_t>(Imm); + if (isInlinableIntLiteral(SImm)) { + O << SImm; + return; + } + + if (printImmediateFloat32(Imm, STI, O)) + return; + + O << formatHex(static_cast<uint64_t>(Imm)); } void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, @@ -639,6 +684,20 @@ void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo, printU4ImmDecOperand(MI, OpNo, O); } +void 
AMDGPUInstPrinter::printWaitVAVDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << " wait_va_vdst:"; + printU4ImmDecOperand(MI, OpNo, O); +} + +void AMDGPUInstPrinter::printWaitVMVSrc(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + O << " wait_vm_vsrc:"; + printU4ImmDecOperand(MI, OpNo, O); +} + void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -741,25 +800,11 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - if (!isUInt<16>(Op.getImm()) && - STI.hasFeature(AMDGPU::FeatureVOP3Literal)) { - printImmediate32(Op.getImm(), STI, O); - break; - } - - // Deal with 16-bit FP inline immediates not working. - if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) { - printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; - } - [[fallthrough]]; case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O); - break; case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: - printImmediateV216(Op.getImm(), STI, O); + printImmediateV216(Op.getImm(), OpTy, STI, O); break; case MCOI::OPERAND_UNKNOWN: case MCOI::OPERAND_PCREL: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 95c26de6299e..e3958f88277d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -88,8 +88,10 @@ private: raw_ostream &O); void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI, - raw_ostream &O); + void printImmediateV216(uint32_t Imm, uint8_t OpType, + const MCSubtargetInfo &STI, raw_ostream &O); + bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, @@ -161,6 +163,10 @@ private: raw_ostream &O); void printWaitEXP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVAVDst(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printWaitVMVSrc(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printExpSrcN(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, unsigned N); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index b403d69d9ff1..de1abaf29c56 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -284,22 +284,15 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO, // which does not have f16 support? 
return getLit16Encoding(static_cast<uint16_t>(Imm), STI); case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: { - if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) - return getLit32Encoding(static_cast<uint32_t>(Imm), STI); - if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) - return getLit16Encoding(static_cast<uint16_t>(Imm), STI); - [[fallthrough]]; - } case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI); + return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm)) + .value_or(255); + case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint16_t Lo16 = static_cast<uint16_t>(Imm); - uint32_t Encoding = getLit16Encoding(Lo16, STI); - return Encoding; - } + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm)) + .value_or(255); case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return MO.getImm(); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 9a2fb0bc37b2..674fd04f2fc1 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1651,7 +1651,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + unsigned Idx = Swz[i]->getAsZExtVal(); if (SwizzleRemap.contains(Idx)) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } @@ -1659,7 +1659,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SwizzleRemap.clear(); BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap); for (unsigned i = 0; i < 4; i++) { - unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue(); + unsigned Idx = Swz[i]->getAsZExtVal(); if (SwizzleRemap.contains(Idx)) Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32); } @@ -1780,7 +1780,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, // Check that we know which element is being inserted if (!isa<ConstantSDNode>(EltNo)) return SDValue(); - unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); + unsigned Elt = EltNo->getAsZExtVal(); // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the @@ -2021,7 +2021,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx, } case R600::MOV_IMM_GLOBAL_ADDR: // Check if the Imm slot is used. Taken from below. 
- if (cast<ConstantSDNode>(Imm)->getZExtValue()) + if (Imm->getAsZExtVal()) return false; Imm = Src.getOperand(0); Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 709de612d81d..aa7639a0f186 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,9 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || - (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) || - isUInt<16>(Fold.ImmToFold) || - !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm())) + (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; unsigned Opcode = MI->getOpcode(); @@ -234,42 +232,123 @@ bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); unsigned Opcode = MI->getOpcode(); int OpNo = MI->getOperandNo(&Old); + uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType; + + // If the literal can be inlined as-is, apply it and short-circuit the + // tests below. The main motivation for this is to avoid unintuitive + // uses of opsel. + if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) { + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } - // Set op_sel/op_sel_hi on this operand or bail out if op_sel is - // already set. + // Refer to op_sel/op_sel_hi and check if we can change the immediate and + // op_sel in a way that allows an inline constant. int ModIdx = -1; - if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) + unsigned SrcIdx = ~0; + if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) { ModIdx = AMDGPU::OpName::src0_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) + SrcIdx = 0; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) { ModIdx = AMDGPU::OpName::src1_modifiers; - else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) + SrcIdx = 1; + } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) { ModIdx = AMDGPU::OpName::src2_modifiers; + SrcIdx = 2; + } assert(ModIdx != -1); ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx); MachineOperand &Mod = MI->getOperand(ModIdx); - unsigned Val = Mod.getImm(); - if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1)) + unsigned ModVal = Mod.getImm(); + + uint16_t ImmLo = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0)); + uint16_t ImmHi = static_cast<uint16_t>( + Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0)); + uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo; + unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + + // Helper function that attempts to inline the given value with a newly + // chosen opsel pattern. + auto tryFoldToInline = [&](uint32_t Imm) -> bool { + if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(Imm); + return true; + } + + // Try to shuffle the halves around and leverage opsel to get an inline + // constant. 
+ uint16_t Lo = static_cast<uint16_t>(Imm); + uint16_t Hi = static_cast<uint16_t>(Imm >> 16); + if (Lo == Hi) { + if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(Lo); + return true; + } + + if (static_cast<int16_t>(Lo) < 0) { + int32_t SExt = static_cast<int16_t>(Lo); + if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) { + Mod.setImm(NewModVal); + Old.ChangeToImmediate(SExt); + return true; + } + } + + // This check is only useful for integer instructions + if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 || + OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) { + if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1); + Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16); + return true; + } + } + } else { + uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi; + if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) { + Mod.setImm(NewModVal | SISrcMods::OP_SEL_0); + Old.ChangeToImmediate(Swapped); + return true; + } + } + return false; + }; - // Only apply the following transformation if that operand requires - // a packed immediate. - // If upper part is all zero we do not need op_sel_hi. - if (!(Fold.ImmToFold & 0xffff)) { - MachineOperand New = - MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0); - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff); + if (tryFoldToInline(Imm)) return true; + + // Replace integer addition by subtraction and vice versa if it allows + // folding the immediate to an inline constant. + // + // We should only ever get here for SrcIdx == 1 due to canonicalization + // earlier in the pipeline, but we double-check here to be safe / fully + // general. + bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16; + bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16; + if (SrcIdx == 1 && (IsUAdd || IsUSub)) { + unsigned ClampIdx = + AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp); + bool Clamp = MI->getOperand(ClampIdx).getImm() != 0; + + if (!Clamp) { + uint16_t NegLo = -static_cast<uint16_t>(Imm); + uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16); + uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo; + + if (tryFoldToInline(NegImm)) { + unsigned NegOpcode = + IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16; + MI->setDesc(TII->get(NegOpcode)); + return true; + } + } } - MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff); - if (!TII->isOperandLegal(*MI, OpNo, &New)) - return false; - Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1); - Old.ChangeToImmediate(Fold.ImmToFold & 0xffff); - return true; + + return false; } bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { @@ -277,8 +356,19 @@ bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const { MachineOperand &Old = MI->getOperand(Fold.UseOpNo); assert(Old.isReg()); - if (Fold.isImm() && canUseImmWithOpSel(Fold)) - return tryFoldImmWithOpSel(Fold); + if (Fold.isImm() && canUseImmWithOpSel(Fold)) { + if (tryFoldImmWithOpSel(Fold)) + return true; + + // We can't represent the candidate as an inline constant. Try as a literal + // with the original opsel, checking constant bus limitations. 
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold); + int OpNo = MI->getOperandNo(&Old); + if (!TII->isOperandLegal(*MI, OpNo, &New)) + return false; + Old.ChangeToImmediate(Fold.ImmToFold); + return true; + } if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) { MachineBasicBlock *MBB = MI->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0e857e6ac71b..6ddc7e864fb2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -151,22 +151,29 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->useRealTrue16Insts()) { addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); + addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass); } else { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass); } // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); + addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -196,6 +203,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::i1, MVT::v32i32}, Custom); + if (isTypeLegal(MVT::bf16)) { + for (unsigned Opc : + {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV, + ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM, + ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT, + ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI, + ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2, + ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, + ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT, + ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE, + ISD::SETCC}) { + // FIXME: The promoted to type shouldn't need to be explicit + setOperationAction(Opc, MVT::bf16, Promote); + AddPromotedToType(Opc, MVT::bf16, MVT::f32); + } + + setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand); + + setOperationAction(ISD::SELECT, MVT::bf16, Promote); + AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16); + + // TODO: Could make these legal + setOperationAction(ISD::FABS, MVT::bf16, Expand); + setOperationAction(ISD::FNEG, MVT::bf16, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand); + + // We only need to custom lower because we can't specify an action for bf16 + // sources. 
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16); + } + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); @@ -271,13 +313,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. for (MVT VT : - {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, - MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, - MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, - MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { + {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, + MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, + MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, + MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32, + MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, + MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, + MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32, + MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -383,13 +426,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, Expand); - setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom); + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, + Custom); // Avoid stack access for these. // TODO: Generalize to more vector types. setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}, - {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v4i16, MVT::v4f16}, + {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8, + MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16}, Custom); // Deal with vec3 vector operations when widened to vec4. @@ -498,6 +542,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand); setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand); + // Custom lower these because we can't specify a rule based on an illegal + // source bf16. + setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom); + setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom); + if (Subtarget->has16BitInsts()) { setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT}, @@ -524,9 +573,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom); // F16 - Constant Actions. 
setOperationAction(ISD::ConstantFP, MVT::f16, Legal); + setOperationAction(ISD::ConstantFP, MVT::bf16, Legal); // F16 - Load/Store Actions. setOperationAction(ISD::LOAD, MVT::f16, Promote); @@ -534,16 +588,23 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::f16, Promote); AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16); + // BF16 - Load/Store Actions. + setOperationAction(ISD::LOAD, MVT::bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16); + setOperationAction(ISD::STORE, MVT::bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16); + // F16 - VOP1 Actions. setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, MVT::f16, Custom); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote); // F16 - VOP2 Actions. - setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); + setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16}, + Expand); setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); setOperationAction(ISD::FFREXP, MVT::f16, Custom); setOperationAction(ISD::FDIV, MVT::f16, Custom); @@ -554,8 +615,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMAD, MVT::f16, Legal); for (MVT VT : - {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { + {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16, + MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, + MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -587,7 +649,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // XXX - Do these do anything? Vector constants turn into build_vector. 
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal); - setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal); + setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, + Legal); setOperationAction(ISD::STORE, MVT::v2i16, Promote); AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32); @@ -610,16 +673,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32); setOperationAction(ISD::LOAD, MVT::v4f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::LOAD, MVT::v4bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v4i16, Promote); AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); setOperationAction(ISD::STORE, MVT::v4f16, Promote); AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32); + setOperationAction(ISD::STORE, MVT::v4bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32); setOperationAction(ISD::LOAD, MVT::v8i16, Promote); AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32); setOperationAction(ISD::LOAD, MVT::v8f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32); + setOperationAction(ISD::LOAD, MVT::v8bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32); setOperationAction(ISD::STORE, MVT::v4i16, Promote); AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32); @@ -630,26 +699,36 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32); setOperationAction(ISD::STORE, MVT::v8f16, Promote); AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32); + setOperationAction(ISD::STORE, MVT::v8bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32); setOperationAction(ISD::LOAD, MVT::v16i16, Promote); AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32); setOperationAction(ISD::LOAD, MVT::v16f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v16bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32); setOperationAction(ISD::STORE, MVT::v16i16, Promote); AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32); setOperationAction(ISD::STORE, MVT::v16f16, Promote); AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + setOperationAction(ISD::STORE, MVT::v16bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32); setOperationAction(ISD::LOAD, MVT::v32i16, Promote); AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); setOperationAction(ISD::LOAD, MVT::v32f16, Promote); AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32bf16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32); setOperationAction(ISD::STORE, MVT::v32i16, Promote); AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); setOperationAction(ISD::STORE, MVT::v32f16, Promote); AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32bf16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32); setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, MVT::v2i32, Expand); @@ -662,7 +741,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::v8i32, Expand); if (!Subtarget->hasVOP3PInsts()) - setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom); + setOperationAction(ISD::BUILD_VECTOR, + {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom); 
setOperationAction(ISD::FNEG, MVT::v2f16, Legal); // This isn't really legal, but this avoids the legalizer unrolling it (and @@ -680,8 +760,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, Expand); - for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, - MVT::v32i16, MVT::v32f16}) { + for (MVT Vec16 : + {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16, + MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) { setOperationAction( {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, Vec16, Custom); @@ -699,7 +780,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE}, MVT::v2f16, Legal); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16}, + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, @@ -724,7 +805,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::FEXP, MVT::v2f16, Custom); - setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom); + setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, + Custom); if (Subtarget->hasPackedFP32Ops()) { setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, @@ -750,13 +832,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } setOperationAction(ISD::SELECT, - {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, - MVT::v32i16, MVT::v32f16}, + {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8, + MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16, + MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16, + MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); + if (Subtarget->hasScalarSMulU64()) + setOperationAction(ISD::MUL, MVT::i64, Custom); + if (Subtarget->hasMad64_32()) setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom); @@ -3902,6 +3988,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { return Op; } +// Work around DAG legality rules only based on the result type. +SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND; + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); + EVT SrcVT = Src.getValueType(); + + if (SrcVT.getScalarType() != MVT::bf16) + return Op; + + SDLoc SL(Op); + SDValue BitCast = + DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src); + + EVT DstVT = Op.getValueType(); + if (IsStrict) + llvm_unreachable("Need STRICT_BF16_TO_FP"); + + return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -4825,6 +4931,48 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } + case AMDGPU::GET_SHADERCYCLESHILO: { + assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + // The algorithm is: + // + // hi1 = getreg(SHADER_CYCLES_HI) + // lo1 = getreg(SHADER_CYCLES_LO) + // hi2 = getreg(SHADER_CYCLES_HI) + // + // If hi1 == hi2 then there was no overflow and the result is hi2:lo1. 
+ // Otherwise there was overflow and the result is hi2:0. In both cases the + // result should represent the actual time at some point during the sequence + // of three getregs. + Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) + .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, + 0, 32)); + Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) + .addImm( + AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES, 0, 32)); + Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) + .addImm(AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_SHADER_CYCLES_HI, + 0, 32)); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(RegHi1) + .addReg(RegHi2); + Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) + .addReg(RegLo1) + .addImm(0); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) + .add(MI.getOperand(0)) + .addReg(RegLo) + .addImm(AMDGPU::sub0) + .addReg(RegHi2) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } case AMDGPU::SI_INDIRECT_SRC_V1: case AMDGPU::SI_INDIRECT_SRC_V2: case AMDGPU::SI_INDIRECT_SRC_V4: @@ -5305,7 +5453,9 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16); + VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 || + VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 || + VT == MVT::v32bf16); SDValue Lo0, Hi0; SDValue Op0 = Op.getOperand(0); @@ -5424,7 +5574,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL: case ISD::ADD: case ISD::SUB: - case ISD::MUL: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: @@ -5438,6 +5587,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SADDSAT: case ISD::SSUBSAT: return splitBinaryVectorOp(Op, DAG); + case ISD::MUL: + return lowerMUL(Op, DAG); case ISD::SMULO: case ISD::UMULO: return lowerXMULO(Op, DAG); @@ -5452,6 +5603,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerGET_ROUNDING(Op, DAG); case ISD::PREFETCH: return lowerPREFETCH(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: + return lowerFP_EXTEND(Op, DAG); } return SDValue(); } @@ -6090,6 +6244,66 @@ SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp); } +// Custom lowering for vector multiplications and s_mul_u64. +SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + // Split vector operands. + if (VT.isVector()) + return splitBinaryVectorOp(Op, DAG); + + assert(VT == MVT::i64 && "The following code is a special for s_mul_u64"); + + // There are four ways to lower s_mul_u64: + // + // 1. If all the operands are uniform, then we lower it as it is. + // + // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit + // multiplications because there is not a vector equivalent of s_mul_u64. + // + // 3. 
If the cost model decides that it is more efficient to use vector + // registers, then we have to split s_mul_u64 in 32-bit multiplications. + // This happens in splitScalarSMULU64() in SIInstrInfo.cpp . + // + // 4. If the cost model decides to use vector registers and both of the + // operands are zero-extended/sign-extended from 32-bits, then we split the + // s_mul_u64 in two 32-bit multiplications. The problem is that it is not + // possible to check if the operands are zero-extended or sign-extended in + // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with + // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace + // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended. + // If the cost model decides that we have to use vector registers, then + // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/ + // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model + // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/ + // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in + // SIInstrInfo.cpp . + + if (Op->isDivergent()) + return SDValue(); + + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64 + // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to + // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. + KnownBits Op0KnownBits = DAG.computeKnownBits(Op0); + unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros(); + KnownBits Op1KnownBits = DAG.computeKnownBits(Op1); + unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros(); + SDLoc SL(Op); + if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32) + return SDValue( + DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0); + unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0); + unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1); + if (Op0SignBits >= 33 && Op1SignBits >= 33) + return SDValue( + DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0); + // If all the operands are uniform, then we lower s_mul_u64 as it is. 
+ return Op; +} + SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc SL(Op); @@ -6424,7 +6638,7 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, EVT InsVT = Ins.getValueType(); EVT EltVT = VecVT.getVectorElementType(); unsigned InsNumElts = InsVT.getVectorNumElements(); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = Idx->getAsZExtVal(); SDLoc SL(Op); if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) { @@ -6639,7 +6853,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec); SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx); - if (ResultVT == MVT::f16) { + if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) { SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt); return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result); } @@ -6725,8 +6939,8 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SDLoc SL(Op); EVT VT = Op.getValueType(); - if (VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8i16 || VT == MVT::v8f16) { + if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 || + VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) { EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), VT.getVectorNumElements() / 2); MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits()); @@ -6749,7 +6963,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } - if (VT == MVT::v16i16 || VT == MVT::v16f16) { + if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) { EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), VT.getVectorNumElements() / 4); MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); @@ -6770,7 +6984,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } - if (VT == MVT::v32i16 || VT == MVT::v32f16) { + if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) { EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), VT.getVectorNumElements() / 8); MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); @@ -6791,7 +7005,7 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } - assert(VT == MVT::v2f16 || VT == MVT::v2i16); + assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); SDValue Lo = Op.getOperand(0); @@ -6890,6 +7104,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, // Adjust alignment for that dynamic shared memory array. Function &F = DAG.getMachineFunction().getFunction(); MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); + MFI->setUsesDynamicLDS(true); return SDValue( DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0); } @@ -7453,7 +7668,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(IsA16 ? True : False); if (!Subtarget->hasGFX90AInsts()) { Ops.push_back(TFE); //tfe - } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) { + } else if (TFE->getAsZExtVal()) { report_fatal_error("TFE is not supported on this GPU"); } if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) @@ -7590,7 +7805,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 
Align(16 * NumLoads) : Align(4)); - uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); + uint64_t InstOffset = Ops[5]->getAsZExtVal(); for (unsigned i = 0; i < NumLoads; ++i) { Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, @@ -14052,11 +14267,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, EVT VT = N->getValueType(0); // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) - if (VT == MVT::v2i16 || VT == MVT::v2f16) { + if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2f16) { SDLoc SL(N); SDValue Src = N->getOperand(0); EVT EltVT = Src.getValueType(); - if (EltVT == MVT::f16) + if (EltVT != MVT::i16) Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src); SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 5bc091d6e84d..92b38ebade62 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -146,6 +146,7 @@ private: SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; @@ -417,6 +418,7 @@ public: SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 55ddb540c51e..1cb1d32707f2 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1424,6 +1424,12 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat( }); } +static bool isCacheInvOrWBInst(MachineInstr &Inst) { + auto Opc = Inst.getOpcode(); + return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB || + Opc == AMDGPU::GLOBAL_WBINV; +} + void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, WaitcntBrackets *ScoreBrackets) { // Now look at the instruction opcode. If it is a memory access @@ -1439,6 +1445,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst); } } else if (TII->isFLAT(Inst)) { + // TODO: Track this properly. + if (isCacheInvOrWBInst(Inst)) + return; + assert(Inst.mayLoadOrStore()); int FlatASCount = 0; diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 585a3eb78618..1b66d163714f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -91,7 +91,7 @@ class InstSI <dag outs, dag ins, string asm = "", field bit VOP3_OPSEL = 0; // Is it possible for this instruction to be atomic? - field bit maybeAtomic = 0; + field bit maybeAtomic = 1; // This bit indicates that this is a VI instruction which is renamed // in GFX9. Required for correct mapping from pseudo to MC. 
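The SIInstrInfo.cpp hunks below add splitScalarSMulU64(), which rewrites a uniform S_MUL_U64 into 32-bit vector multiplies once the result has to live in VGPRs. As a standalone illustration of the arithmetic those hunks implement (a hedged C++ sketch, not code from this patch; the function name is made up), a 64x64->64 multiply needs only three 32-bit multiplies because the Hi*Hi partial product lies entirely above bit 63:

#include <cstdint>

// Sketch of the decomposition splitScalarSMulU64 builds out of
// V_MUL_LO_U32 / V_MUL_HI_U32 / V_ADD_U32.
uint64_t mul64_from_32bit_pieces(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);

  uint64_t LoFull = static_cast<uint64_t>(ALo) * BLo;     // V_MUL_LO_U32 + V_MUL_HI_U32
  uint32_t Lo     = static_cast<uint32_t>(LoFull);        // low 32 bits of the result
  uint32_t Carry  = static_cast<uint32_t>(LoFull >> 32);  // carry into the high half

  uint32_t Hi = static_cast<uint32_t>(static_cast<uint64_t>(ALo) * BHi)  // Op0L*Op1H
              + static_cast<uint32_t>(static_cast<uint64_t>(AHi) * BLo)  // Op0H*Op1L
              + Carry;                     // AHi*BHi is dropped: it only affects bits >= 64

  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

The same reasoning explains the S_MUL_U64_U32/S_MUL_I64_I32 pseudos chosen in lowerMUL() above: when both inputs are known to be zero- or sign-extended from 32 bits, the cross terms vanish and a single V_MUL_LO_U32 plus V_MUL_HI_U32 (or V_MUL_HI_I32) is enough, which is what splitScalarSMulPseudo() emits.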
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 396d22c7ec18..fee900b3efb2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -338,8 +338,8 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) return false; - Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); - Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); + Offset0 = Off0->getAsZExtVal(); + Offset1 = Off1->getAsZExtVal(); return true; } @@ -2475,6 +2475,11 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + + case AMDGPU::S_MUL_U64_U32_PSEUDO: + case AMDGPU::S_MUL_I64_I32_PSEUDO: + MI.setDesc(get(AMDGPU::S_MUL_U64)); + break; } return true; } @@ -4153,15 +4158,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - return (isInt<16>(Imm) || isUInt<16>(Imm)) && - AMDGPU::isInlinableIntLiteral((int16_t)Imm); + return AMDGPU::isInlinableLiteralV2I16(Imm); + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return AMDGPU::isInlinableLiteralV2F16(Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -6845,6 +6850,21 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Default handling break; } + + case AMDGPU::S_MUL_U64: + // Split s_mul_u64 in 32-bit vector multiplications. + splitScalarSMulU64(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + + case AMDGPU::S_MUL_U64_U32_PSEUDO: + case AMDGPU::S_MUL_I64_I32_PSEUDO: + // This is a special case of s_mul_u64 where all the operands are either + // zero extended or sign extended. + splitScalarSMulPseudo(Worklist, Inst, MDT); + Inst.eraseFromParent(); + return; + case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT); Inst.eraseFromParent(); @@ -7654,6 +7674,180 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } +// There is not a vector equivalent of s_mul_u64. For this reason, we need to +// split the s_mul_u64 in 32-bit vector multiplications. 
+void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); + const TargetRegisterClass *Src0SubRC = + RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src0SubRC)) + Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); + const TargetRegisterClass *Src1SubRC = + RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src1SubRC)) + Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); + + // First, we extract the low 32-bit and high 32-bit values from each of the + // operands. + MachineOperand Op0L = + buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand Op1L = + buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + MachineOperand Op0H = + buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); + MachineOperand Op1H = + buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + + // The multilication is done as follows: + // + // Op1H Op1L + // * Op0H Op0L + // -------------------- + // Op1H*Op0L Op1L*Op0L + // + Op1H*Op0H Op1L*Op0H + // ----------------------------------------- + // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L + // + // We drop Op1H*Op0H because the result of the multiplication is a 64-bit + // value and that would overflow. + // The low 32-bit value is Op1L*Op0L. + // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L). + + Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *Op1L_Op0H = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg) + .add(Op1L) + .add(Op0H); + + Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *Op1H_Op0L = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg) + .add(Op1H) + .add(Op0L); + + Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *Carry = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg) + .add(Op1L) + .add(Op0L); + + MachineInstr *LoHalf = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) + .add(Op1L) + .add(Op0L); + + Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg) + .addReg(Op1L_Op0H_Reg) + .addReg(Op1H_Op0L_Reg); + + MachineInstr *HiHalf = + BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1) + .addReg(AddReg) + .addReg(CarryReg); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. 
+ legalizeOperands(*Op1L_Op0H, MDT); + legalizeOperands(*Op1H_Op0L, MDT); + legalizeOperands(*Carry, MDT); + legalizeOperands(*LoHalf, MDT); + legalizeOperands(*Add, MDT); + legalizeOperands(*HiHalf, MDT); + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + +// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO in two 32-bit vector +// multiplications. +void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist, + MachineInstr &Inst, + MachineDominatorTree *MDT) const { + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + const DebugLoc &DL = Inst.getDebugLoc(); + MachineBasicBlock::iterator MII = Inst; + + const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg()); + const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg()); + const TargetRegisterClass *Src0SubRC = + RI.getSubRegisterClass(Src0RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src0SubRC)) + Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC); + const TargetRegisterClass *Src1SubRC = + RI.getSubRegisterClass(Src1RC, AMDGPU::sub0); + if (RI.isSGPRClass(Src1SubRC)) + Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC); + + // First, we extract the low 32-bit and high 32-bit values from each of the + // operands. + MachineOperand Op0L = + buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand Op1L = + buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + unsigned Opc = Inst.getOpcode(); + unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO + ? AMDGPU::V_MUL_HI_U32_e64 + : AMDGPU::V_MUL_HI_I32_e64; + MachineInstr *HiHalf = + BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L); + + MachineInstr *LoHalf = + BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0) + .add(Op1L) + .add(Op0L); + + BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + + MRI.replaceRegWith(Dest.getReg(), FullDestReg); + + // Try to legalize the operands in case we need to swap the order to keep it + // valid. + legalizeOperands(*HiHalf, MDT); + legalizeOperands(*LoHalf, MDT); + + // Move all users of this moved value. 
+ addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode, MachineDominatorTree *MDT) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 46eee6fae0a5..37ee159362a2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -138,6 +138,12 @@ private: unsigned Opcode, MachineDominatorTree *MDT = nullptr) const; + void splitScalarSMulU64(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const; + + void splitScalarSMulPseudo(SIInstrWorklist &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const; + void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineDominatorTree *MDT = nullptr) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 173c877b8d29..f07b8fa0ea4c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -860,23 +860,6 @@ def ShiftAmt32Imm : ImmLeaf <i32, [{ return Imm < 32; }]>; -def getNegV2I16Imm : SDNodeXForm<build_vector, [{ - return SDValue(packNegConstantV2I16(N, *CurDAG), 0); -}]>; - -def NegSubInlineConstV216 : PatLeaf<(build_vector), [{ - assert(N->getNumOperands() == 2); - assert(N->getOperand(0).getValueType().getSizeInBits() == 16); - SDValue Src0 = N->getOperand(0); - SDValue Src1 = N->getOperand(1); - if (Src0 == Src1) - return isNegInlineImmediate(Src0.getNode()); - - return (isNullConstantOrUndef(Src0) && isNegInlineImmediate(Src1.getNode())) || - (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); -}], getNegV2I16Imm>; - - def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{ return fp16SrcZerosHighBits(N->getOpcode()); }]>; @@ -1144,6 +1127,8 @@ def exp_tgt : CustomOperand<i32, 0, "ExpTgt">; def wait_vdst : NamedIntOperand<i8, "wait_vdst", "WaitVDST">; def wait_exp : NamedIntOperand<i8, "wait_exp", "WaitEXP">; +def wait_va_vdst : NamedIntOperand<i8, "wait_va_vdst", "WaitVAVDst">; +def wait_va_vsrc : NamedIntOperand<i8, "wait_vm_vsrc", "WaitVMVSrc">; class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 8310c6b57dad..b4bd46d33c1f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,7 @@ include "SMInstructions.td" include "FLATInstructions.td" include "BUFInstructions.td" include "EXPInstructions.td" -include "LDSDIRInstructions.td" +include "DSDIRInstructions.td" include "VINTERPInstructions.td" //===----------------------------------------------------------------------===// @@ -111,7 +111,6 @@ def ATOMIC_FENCE : SPseudoInstSI< [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))], "ATOMIC_FENCE $ordering, $scope"> { let hasSideEffects = 1; - let maybeAtomic = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { @@ -316,6 +315,12 @@ def S_USUBO_PSEUDO : SPseudoInstSI < (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) >; +let OtherPredicates = [HasShaderCyclesHiLoRegisters] in +def GET_SHADERCYCLESHILO : SPseudoInstSI< + (outs SReg_64:$sdst), (ins), + [(set SReg_64:$sdst, (i64 (readcyclecounter)))] +>; + } // End usesCustomInserter = 1, Defs = [SCC] let usesCustomInserter = 1 in { @@ -557,6 +562,7 @@ def SI_MASKED_UNREACHABLE : SPseudoInstSI 
<(outs), (ins), let hasNoSchedulingInfo = 1; let FixedSize = 1; let isMeta = 1; + let maybeAtomic = 0; } // Used as an isel pseudo to directly emit initialization with an @@ -1097,7 +1103,7 @@ def : Pat < multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> { // f16_to_fp patterns def : GCNPat < - (f32 (f16_to_fp i32:$src0)), + (f32 (any_f16_to_fp i32:$src0)), (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0) >; @@ -1122,7 +1128,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16 >; def : GCNPat < - (f64 (fpextend f16:$src)), + (f64 (any_fpextend f16:$src)), (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src)) >; @@ -1151,6 +1157,13 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16 (f16 (uint_to_fp i32:$src)), (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src)) >; + + // This is only used on targets without half support + // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering + def : GCNPat < + (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) + >; } let SubtargetPredicate = NotHasTrue16BitInsts in @@ -1515,6 +1528,23 @@ def : BitConvert <v2f16, f32, SReg_32>; def : BitConvert <f32, v2f16, SReg_32>; def : BitConvert <v2i16, f32, SReg_32>; def : BitConvert <f32, v2i16, SReg_32>; +def : BitConvert <v2bf16, i32, SReg_32>; +def : BitConvert <i32, v2bf16, SReg_32>; +def : BitConvert <v2bf16, i32, VGPR_32>; +def : BitConvert <i32, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, v2i16, SReg_32>; +def : BitConvert <v2i16, v2bf16, SReg_32>; +def : BitConvert <v2bf16, v2i16, VGPR_32>; +def : BitConvert <v2i16, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, v2f16, SReg_32>; +def : BitConvert <v2f16, v2bf16, SReg_32>; +def : BitConvert <v2bf16, v2f16, VGPR_32>; +def : BitConvert <v2f16, v2bf16, VGPR_32>; +def : BitConvert <f32, v2bf16, VGPR_32>; +def : BitConvert <v2bf16, f32, VGPR_32>; +def : BitConvert <f32, v2bf16, SReg_32>; +def : BitConvert <v2bf16, f32, SReg_32>; + // 64-bit bitcast def : BitConvert <i64, f64, VReg_64>; @@ -1531,6 +1561,19 @@ def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; def : BitConvert <v4i16, v4f16, VReg_64>; def : BitConvert <v4f16, v4i16, VReg_64>; +def : BitConvert <v4bf16, v2i32, VReg_64>; +def : BitConvert <v2i32, v4bf16, VReg_64>; +def : BitConvert <v4bf16, i64, VReg_64>; +def : BitConvert <i64, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v4i16, VReg_64>; +def : BitConvert <v4i16, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v4f16, VReg_64>; +def : BitConvert <v4f16, v4bf16, VReg_64>; +def : BitConvert <v4bf16, v2f32, VReg_64>; +def : BitConvert <v2f32, v4bf16, VReg_64>; +def : BitConvert <v4bf16, f64, VReg_64>; +def : BitConvert <f64, v4bf16, VReg_64>; + // FIXME: Make SGPR def : BitConvert <v2i32, v4f16, VReg_64>; @@ -1590,6 +1633,37 @@ def : BitConvert <v2f64, v8i16, SReg_128>; def : BitConvert <v2i64, v8f16, SReg_128>; def : BitConvert <v2f64, v8f16, SReg_128>; +def : BitConvert <v4i32, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v4i32, SReg_128>; +def : BitConvert <v4i32, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v4i32, VReg_128>; + +def : BitConvert <v4f32, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v4f32, SReg_128>; +def : BitConvert <v4f32, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v4f32, VReg_128>; + +def : BitConvert <v8i16, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v8i16, SReg_128>; +def : 
BitConvert <v8i16, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v8i16, VReg_128>; + +def : BitConvert <v8f16, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v8f16, SReg_128>; +def : BitConvert <v8f16, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v8f16, VReg_128>; + +def : BitConvert <v2f64, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v2f64, SReg_128>; +def : BitConvert <v2f64, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v2f64, VReg_128>; + +def : BitConvert <v2i64, v8bf16, SReg_128>; +def : BitConvert <v8bf16, v2i64, SReg_128>; +def : BitConvert <v2i64, v8bf16, VReg_128>; +def : BitConvert <v8bf16, v2i64, VReg_128>; + + // 160-bit bitcast def : BitConvert <v5i32, v5f32, SReg_160>; def : BitConvert <v5f32, v5i32, SReg_160>; @@ -1654,6 +1728,31 @@ def : BitConvert <v4i64, v16i16, VReg_256>; def : BitConvert <v4f64, v16f16, VReg_256>; def : BitConvert <v4f64, v16i16, VReg_256>; + +def : BitConvert <v8i32, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v8i32, VReg_256>; +def : BitConvert <v8f32, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v8f32, VReg_256>; +def : BitConvert <v4i64, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v4i64, VReg_256>; +def : BitConvert <v4f64, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v4f64, VReg_256>; + + + +def : BitConvert <v16i16, v16bf16, SReg_256>; +def : BitConvert <v16bf16, v16i16, SReg_256>; +def : BitConvert <v16i16, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v16i16, VReg_256>; + +def : BitConvert <v16f16, v16bf16, SReg_256>; +def : BitConvert <v16bf16, v16f16, SReg_256>; +def : BitConvert <v16f16, v16bf16, VReg_256>; +def : BitConvert <v16bf16, v16f16, VReg_256>; + + + + // 288-bit bitcast def : BitConvert <v9i32, v9f32, SReg_288>; def : BitConvert <v9f32, v9i32, SReg_288>; @@ -1702,6 +1801,38 @@ def : BitConvert <v8f64, v16f32, VReg_512>; def : BitConvert <v16f32, v8i64, VReg_512>; def : BitConvert <v16f32, v8f64, VReg_512>; + + +def : BitConvert <v32bf16, v32i16, VReg_512>; +def : BitConvert <v32i16, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v32i16, SReg_512>; +def : BitConvert <v32i16, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v32f16, VReg_512>; +def : BitConvert <v32f16, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v32f16, SReg_512>; +def : BitConvert <v32f16, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v16i32, VReg_512>; +def : BitConvert <v16i32, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v16i32, SReg_512>; +def : BitConvert <v16i32, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v16f32, VReg_512>; +def : BitConvert <v16f32, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v16f32, SReg_512>; +def : BitConvert <v16f32, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v8f64, VReg_512>; +def : BitConvert <v8f64, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v8f64, SReg_512>; +def : BitConvert <v8f64, v32bf16, SReg_512>; + +def : BitConvert <v32bf16, v8i64, VReg_512>; +def : BitConvert <v8i64, v32bf16, VReg_512>; +def : BitConvert <v32bf16, v8i64, SReg_512>; +def : BitConvert <v8i64, v32bf16, SReg_512>; + // 1024-bit bitcast def : BitConvert <v32i32, v32f32, VReg_1024>; def : BitConvert <v32f32, v32i32, VReg_1024>; @@ -1958,19 +2089,21 @@ def : GCNPat < let SubtargetPredicate = HasPackedFP32Ops; } +foreach fp16vt = [f16, bf16] in { + def : GCNPat < - (fcopysign f16:$src0, f16:$src1), + (fcopysign fp16vt:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) >; def : GCNPat < - (fcopysign f32:$src0, f16:$src1), + (fcopysign f32:$src0, fp16vt:$src1), (V_BFI_B32_e64 (S_MOV_B32 
(i32 0x7fffffff)), $src0, (V_LSHLREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < - (fcopysign f64:$src0, f16:$src1), + (fcopysign f64:$src0, fp16vt:$src1), (REG_SEQUENCE SReg_64, (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), @@ -1978,16 +2111,17 @@ def : GCNPat < >; def : GCNPat < - (fcopysign f16:$src0, f32:$src1), + (fcopysign fp16vt:$src0, f32:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), $src1)) >; def : GCNPat < - (fcopysign f16:$src0, f64:$src1), + (fcopysign fp16vt:$src0, f64:$src1), (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) >; +} // End foreach fp16vt = [f16, bf16] /********** ================== **********/ /********** Immediate Patterns **********/ @@ -2026,6 +2160,11 @@ def : GCNPat < (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) >; +def : GCNPat < + (VGPRImm<(bf16 fpimm)>:$imm), + (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm))) +>; + // V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit // immediate and wil be expanded as needed, but we will only use these patterns // for values which can be encoded. @@ -2060,6 +2199,11 @@ def : GCNPat < >; def : GCNPat < + (bf16 fpimm:$imm), + (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) +>; + +def : GCNPat < (p5 frameindex:$fi), (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi))) >; @@ -3741,6 +3885,18 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { let mayStore = 0; } +def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + +def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1); + let hasSideEffects = 0; +} + // This is equivalent to the G_INTRINSIC*, but the operands may have // been legalized depending on the subtarget requirements. 
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 10ec54d3317f..6d749ad1ad24 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -578,6 +578,14 @@ public: bool IsNonTemporal) const override; }; +class SIGfx12CacheControl : public SIGfx11CacheControl { +public: + SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} + + bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, Position Pos) const override; +}; + class SIMemoryLegalizer final : public MachineFunctionPass { private: @@ -857,7 +865,9 @@ std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) { return std::make_unique<SIGfx7CacheControl>(ST); if (Generation < AMDGPUSubtarget::GFX11) return std::make_unique<SIGfx10CacheControl>(ST); - return std::make_unique<SIGfx11CacheControl>(ST); + if (Generation < AMDGPUSubtarget::GFX12) + return std::make_unique<SIGfx11CacheControl>(ST); + return std::make_unique<SIGfx12CacheControl>(ST); } bool SIGfx6CacheControl::enableLoadCacheBypass( @@ -1423,7 +1433,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, bool Changed = false; MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + const DebugLoc &DL = MI->getDebugLoc(); if (Pos == Position::AFTER) ++MI; @@ -2132,6 +2142,62 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, + SIAtomicScope Scope, + SIAtomicAddrSpace AddrSpace, + Position Pos) const { + if (!InsertCacheInv) + return false; + + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + /// The scratch address space does not need the global memory cache + /// to be flushed as all memory operations by the same thread are + /// sequentially consistent, and no other thread can access scratch + /// memory. + + /// Other address spaces do not have a cache. + if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) + return false; + + AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; + switch (Scope) { + case SIAtomicScope::SYSTEM: + ScopeImm = AMDGPU::CPol::SCOPE_SYS; + break; + case SIAtomicScope::AGENT: + ScopeImm = AMDGPU::CPol::SCOPE_DEV; + break; + case SIAtomicScope::WORKGROUP: + // In WGP mode the waves of a work-group can be executing on either CU of + // the WGP. Therefore we need to invalidate the L0 which is per CU. + // Otherwise in CU mode all waves of a work-group are on the same CU, and so + // the L0 does not need to be invalidated. + if (ST.isCuModeEnabled()) + return false; + + ScopeImm = AMDGPU::CPol::SCOPE_SE; + break; + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No cache to invalidate. 
+ return false; + default: + llvm_unreachable("Unsupported synchronization scope"); + } + + if (Pos == Position::AFTER) + ++MI; + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm); + + if (Pos == Position::AFTER) + --MI; + + return true; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index c94b894c5841..f42af89cf5e6 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -916,7 +916,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>; defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>; let GlobalPriority = true in { -defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>; +defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], SGPR_512Regs, TTMP_512Regs>; defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>; } @@ -970,7 +970,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>; defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>; let GlobalPriority = true in { -defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>; +defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16, v32bf16], (add VGPR_512)>; defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>; } @@ -1152,11 +1152,11 @@ class RegOrF32 <string RegisterClass, string OperandTypePrefix> class RegOrV2B16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16", - !subst("_v2b16", "V2B16", NAME), "_Imm16">; + !subst("_v2b16", "V2B16", NAME), "_ImmV2I16">; class RegOrV2F16 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16", - !subst("_v2f16", "V2F16", NAME), "_Imm16">; + !subst("_v2f16", "V2F16", NAME), "_ImmV2F16">; class RegOrF64 <string RegisterClass, string OperandTypePrefix> : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64", diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 59d6ccf513bb..5e6c34992930 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -553,7 +553,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, } continue; } else if (Opcode == AMDGPU::LDS_PARAM_LOAD || - Opcode == AMDGPU::LDS_DIRECT_LOAD) { + Opcode == AMDGPU::DS_PARAM_LOAD || + Opcode == AMDGPU::LDS_DIRECT_LOAD || + Opcode == AMDGPU::DS_DIRECT_LOAD) { // Mark these STRICTWQM, but only for the instruction, not its operands. // This avoid unnecessarily marking M0 as requiring WQM. 
InstrInfo &II = Instructions[&MI]; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 3297847b0360..fc29ce8d71f2 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -29,6 +29,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt let mayStore = 0; let mayLoad = 1; let hasSideEffects = 0; + let maybeAtomic = 0; let UseNamedOperandTable = 1; let SchedRW = [WriteSMEM]; @@ -305,6 +306,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>; defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>; defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>; +defm S_LOAD_I8 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_U8 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_I16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; +defm S_LOAD_U16 : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>; let is_buffer = 1 in { defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; @@ -316,6 +321,10 @@ let SubtargetPredicate = HasScalarDwordx3Loads in defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>; defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>; defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>; +defm S_BUFFER_LOAD_I8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U8 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_I16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; +defm S_BUFFER_LOAD_U16 : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>; } let SubtargetPredicate = HasScalarStores in { @@ -977,20 +986,35 @@ def : GCNPat < } } // let OtherPredicates = [HasShaderCyclesRegister] -multiclass SMPrefetchPat<string type, int cache_type> { +def i32imm_zero : TImmLeaf <i32, [{ + return Imm == 0; +}]>; + +def i32imm_one : TImmLeaf <i32, [{ + return Imm == 1; +}]>; + +multiclass SMPrefetchPat<string type, TImmLeaf cache_type> { def : GCNPat < - (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)), + (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, cache_type), (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0)) >; def : GCNPat < - (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)), + (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, cache_type), (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0)) >; + + def : GCNPat < + (smrd_prefetch (i32 SReg_32:$sbase), timm, timm, cache_type), + (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) + (i64 (REG_SEQUENCE SReg_64, $sbase, sub0, (i32 (S_MOV_B32 (i32 0))), sub1)), + 0, (i32 SGPR_NULL), (i8 0)) + >; } -defm : SMPrefetchPat<"INST", 0>; -defm : SMPrefetchPat<"DATA", 1>; +defm : SMPrefetchPat<"INST", i32imm_zero>; +defm : SMPrefetchPat<"DATA", i32imm_one>; //===----------------------------------------------------------------------===// // GFX10. 
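// Note on the SMPrefetchPat change above (an illustrative sketch, not part of
// the upstream hunks): a TImmLeaf matches a TargetConstant whose value passes
// the inlined C++ predicate, so i32imm_zero / i32imm_one simply pick the
// instruction-prefetch pattern for cache type 0 and the data-prefetch pattern
// for cache type 1 at instruction selection time. The same idiom with a
// purely hypothetical name would be:
//
//   def i32imm_two : TImmLeaf<i32, [{ return Imm == 2; }]>;
//
// The extra pattern for an i32 base uses the usual widening trick: a
// REG_SEQUENCE with the 32-bit base in sub0 and (S_MOV_B32 0) in sub1 forms
// the 64-bit $sbase operand that S_PREFETCH_* expects.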
@@ -1321,6 +1345,11 @@ defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">; defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">; defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">; +defm S_LOAD_I8 : SM_Real_Loads_gfx12<0x08>; +defm S_LOAD_U8 : SM_Real_Loads_gfx12<0x09>; +defm S_LOAD_I16 : SM_Real_Loads_gfx12<0x0a>; +defm S_LOAD_U16 : SM_Real_Loads_gfx12<0x0b>; + defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">; defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">; defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">; @@ -1328,6 +1357,11 @@ defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">; defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">; defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">; +defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx12<0x18>; +defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx12<0x19>; +defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx12<0x1a>; +defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx12<0x1b>; + def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>; def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index c9687ac368d3..46fa3d57a21c 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -673,6 +673,16 @@ let SubtargetPredicate = isGFX12Plus in { let isCommutable = 1; } + // The higher 32-bits of the inputs contain the sign extension bits. + def S_MUL_I64_I32_PSEUDO : SPseudoInstSI < + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + >; + + // The higher 32-bits of the inputs are zero. 
+ def S_MUL_U64_U32_PSEUDO : SPseudoInstSI < + (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) + >; + } // End SubtargetPredicate = isGFX12Plus let Uses = [SCC] in { @@ -1186,14 +1196,12 @@ let SubtargetPredicate = isGFX10Plus in { let SubtargetPredicate = isGFX10GFX11 in { def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">; def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">; -} // End SubtargetPredicate = isGFX10GFX11 -let SubtargetPredicate = isGFX10Plus in { def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">; def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">; def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">; def S_WAITCNT_LGKMCNT : SOPK_WAITCNT<"s_waitcnt_lgkmcnt">; -} // End SubtargetPredicate = isGFX10Plus +} // End SubtargetPredicate = isGFX10GFX11 //===----------------------------------------------------------------------===// // SOPC Instructions @@ -1702,6 +1710,27 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in { SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">; } // End SubtargetPredicate = HasVGPRSingeUseHintInsts +let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { + def S_WAIT_LOADCNT : + SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_LOADCNT_DSCNT : + SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_STORECNT : + SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_STORECNT_DSCNT : + SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_SAMPLECNT : + SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_BVHCNT : + SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_EXPCNT : + SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_DSCNT : + SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_KMCNT : + SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; +} // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2411,10 +2440,10 @@ defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>; defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>; defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>; defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>; -defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11_gfx12<0x018>; -defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11_gfx12<0x019>; -defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11_gfx12<0x01a>; -defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11_gfx12<0x01b>; +defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>; +defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>; +defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>; +defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>; //===----------------------------------------------------------------------===// // SOPK - GFX10. 
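// For context on the waitcnt hunks here (a sketch inferred from the
// definitions in this patch, not an authoritative ISA description): GFX12
// replaces the combined legacy s_waitcnt with per-counter waits, so the SOPK
// S_WAITCNT_* encodings above are kept for GFX11 only while the new SOPP
// S_WAIT_* pseudos receive their own GFX12 encodings in the following hunk.
// In assembly this reads roughly as
//
//   s_wait_loadcnt 0x0   // wait for outstanding vector-memory loads
//   s_wait_dscnt   0x0   // wait for outstanding LDS (DS) operations
//
// rather than a single packed s_waitcnt immediate.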
@@ -2516,6 +2545,15 @@ multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, st defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">; defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>; defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>; +defm S_WAIT_LOADCNT : SOPP_Real_32_gfx12<0x040>; +defm S_WAIT_STORECNT : SOPP_Real_32_gfx12<0x041>; +defm S_WAIT_SAMPLECNT : SOPP_Real_32_gfx12<0x042>; +defm S_WAIT_BVHCNT : SOPP_Real_32_gfx12<0x043>; +defm S_WAIT_EXPCNT : SOPP_Real_32_gfx12<0x044>; +defm S_WAIT_DSCNT : SOPP_Real_32_gfx12<0x046>; +defm S_WAIT_KMCNT : SOPP_Real_32_gfx12<0x047>; +defm S_WAIT_LOADCNT_DSCNT : SOPP_Real_32_gfx12<0x048>; +defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>; //===----------------------------------------------------------------------===// // SOPP - GFX11, GFX12. diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a91d77175234..26ba2575ff34 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -2506,53 +2506,95 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { Val == 0x3118; // 1/2pi } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - if (isInt<16>(Literal) || isUInt<16>(Literal)) { - int16_t Trunc = static_cast<int16_t>(Literal); - return AMDGPU::isInlinableLiteral16(Trunc, HasInv2Pi); +std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) { + // Unfortunately, the Instruction Set Architecture Reference Guide is + // misleading about how the inline operands work for (packed) 16-bit + // instructions. In a nutshell, the actual HW behavior is: + // + // - integer encodings (-16 .. 
64) are always produced as sign-extended + // 32-bit values + // - float encodings are produced as: + // - for F16 instructions: corresponding half-precision float values in + // the LSBs, 0 in the MSBs + // - for UI16 instructions: corresponding single-precision float value + int32_t Signed = static_cast<int32_t>(Literal); + if (Signed >= 0 && Signed <= 64) + return 128 + Signed; + + if (Signed >= -16 && Signed <= -1) + return 192 + std::abs(Signed); + + if (IsFloat) { + // clang-format off + switch (Literal) { + case 0x3800: return 240; // 0.5 + case 0xB800: return 241; // -0.5 + case 0x3C00: return 242; // 1.0 + case 0xBC00: return 243; // -1.0 + case 0x4000: return 244; // 2.0 + case 0xC000: return 245; // -2.0 + case 0x4400: return 246; // 4.0 + case 0xC400: return 247; // -4.0 + case 0x3118: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on + } else { + // clang-format off + switch (Literal) { + case 0x3F000000: return 240; // 0.5 + case 0xBF000000: return 241; // -0.5 + case 0x3F800000: return 242; // 1.0 + case 0xBF800000: return 243; // -1.0 + case 0x40000000: return 244; // 2.0 + case 0xC0000000: return 245; // -2.0 + case 0x40800000: return 246; // 4.0 + case 0xC0800000: return 247; // -4.0 + case 0x3E22F983: return 248; // 1.0 / (2.0 * pi) + default: break; + } + // clang-format on } - if (!(Literal & 0xffff)) - return AMDGPU::isInlinableLiteral16(Literal >> 16, HasInv2Pi); - int16_t Lo16 = static_cast<int16_t>(Literal); - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi); + return {}; } -bool isInlinableIntLiteralV216(int32_t Literal) { - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_IU16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal) { + return getInlineEncodingV216(false, Literal); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return isInlinableIntLiteral(Hi16); - return Lo16 == Hi16 && isInlinableIntLiteral(Lo16); +// Encoding of the literal as an inline constant for a V_PK_*_F16 instruction +// or nullopt. +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal) { + return getInlineEncodingV216(true, Literal); } -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) { +// Whether the given literal can be inlined for a V_PK_* instruction. +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType) { switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + return getInlineEncodingV216(false, Literal).has_value(); case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - return isInlinableLiteralV216(Literal, HasInv2Pi); + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: + return getInlineEncodingV216(true, Literal).has_value(); default: - return isInlinableIntLiteralV216(Literal); + llvm_unreachable("bad packed operand type"); } } -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) { - assert(HasInv2Pi); - - int16_t Lo16 = static_cast<int16_t>(Literal); - if (isInt<16>(Literal) || isUInt<16>(Literal)) - return true; +// Whether the given literal can be inlined for a V_PK_*_IU16 instruction. 
+bool isInlinableLiteralV2I16(uint32_t Literal) { + return getInlineEncodingV2I16(Literal).has_value(); +} - int16_t Hi16 = static_cast<int16_t>(Literal >> 16); - if (!(Literal & 0xffff)) - return true; - return Lo16 == Hi16; +// Whether the given literal can be inlined for a V_PK_*_F16 instruction. +bool isInlinableLiteralV2F16(uint32_t Literal) { + return getInlineEncodingV2F16(Literal).has_value(); } bool isValid32BitLiteral(uint64_t Val, bool IsFP64) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3c9f330cbcde..50c741760d71 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1291,16 +1291,19 @@ LLVM_READNONE bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi); +std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal); LLVM_READNONE -bool isInlinableIntLiteralV216(int32_t Literal); +std::optional<unsigned> getInlineEncodingV2F16(uint32_t Literal); LLVM_READNONE -bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType); +bool isInlinableLiteralV216(uint32_t Literal, uint8_t OpType); LLVM_READNONE -bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); +bool isInlinableLiteralV2I16(uint32_t Literal); + +LLVM_READNONE +bool isInlinableLiteralV2F16(uint32_t Literal); LLVM_READNONE bool isValid32BitLiteral(uint64_t Val, bool IsFP64); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 0aa62ea77b11..ecee61daa1c8 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1300,7 +1300,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps, let OtherPredicates = ps.OtherPredicates; } - + class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> : VOP2_DPP8<op, ps, p> { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 7f52501b5d90..e9d6f67aee16 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -125,15 +125,6 @@ defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3P_Profile<VOP_V2I16_V2 let SubtargetPredicate = HasVOP3PInsts in { -// Undo sub x, c -> add x, -c canonicalization since c is more likely -// an inline immediate than -c. -// The constant will be emitted as a mov, and folded later. -// TODO: We could directly encode the immediate now -def : GCNPat< - (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1), - (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1) ->; - // Integer operations with clamp bit set. class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat< (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), @@ -632,12 +623,12 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node, // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported. 
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in { def _e64 : MAIInst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), - !if(NoDstOverlap, null_frag, AgprMAIFrag<node>)>, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, AgprMAIFrag<node>)>, MFMATable<0, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in def _vgprcd_e64 : MAIInst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), - !if(NoDstOverlap, null_frag, VgprMAIFrag<node>)>, + !if(!or(NoDstOverlap, !eq(node, null_frag)), null_frag, VgprMAIFrag<node>)>, MFMATable<0, NAME # "_vgprcd_e64">; } @@ -645,12 +636,13 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node, let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""), isConvertibleToThreeAddress = NoDstOverlap, Mnemonic = OpName in { - def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), AgprMAIFrag<node>>, + def "_mac_e64" : MAIInst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), + !if(!eq(node, null_frag), null_frag, AgprMAIFrag<node>)>, MFMATable<1, NAME # "_e64">; let SubtargetPredicate = isGFX90APlus in def _mac_vgprcd_e64 : MAIInst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD"), - VgprMAIFrag<node>>, + !if(!eq(node, null_frag), null_frag, VgprMAIFrag<node>)>, MFMATable<1, NAME # "_vgprcd_e64">; } } diff --git a/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp b/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp index 28e35f8f2a54..17c2d7bb13b4 100644 --- a/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARC/ARCISelDAGToDAG.cpp @@ -170,7 +170,7 @@ bool ARCDAGToDAGISel::SelectFrameADDR_ri(SDValue Addr, SDValue &Base, void ARCDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { case ISD::Constant: { - uint64_t CVal = cast<ConstantSDNode>(N)->getZExtValue(); + uint64_t CVal = N->getAsZExtVal(); ReplaceNode(N, CurDAG->getMachineNode( isInt<12>(CVal) ? ARC::MOV_rs12 : ARC::MOV_rlimm, SDLoc(N), MVT::i32, diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp index 2265f5db6737..5dd343d97b80 100644 --- a/llvm/lib/Target/ARC/ARCISelLowering.cpp +++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp @@ -174,6 +174,8 @@ ARCTargetLowering::ARCTargetLowering(const TargetMachine &TM, setOperationAction(ISD::READCYCLECOUNTER, MVT::i32, Legal); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isTypeLegal(MVT::i64) ? 
Legal : Custom); + + setMaxAtomicSizeInBitsSupported(0); } const char *ARCTargetLowering::getTargetNodeName(unsigned Opcode) const { diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp index d4ae3255b32a..4f612ae623b9 100644 --- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp +++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp @@ -57,6 +57,7 @@ public: return getTM<ARCTargetMachine>(); } + void addIRPasses() override; bool addInstSelector() override; void addPreEmitPass() override; void addPreRegAlloc() override; @@ -68,6 +69,12 @@ TargetPassConfig *ARCTargetMachine::createPassConfig(PassManagerBase &PM) { return new ARCPassConfig(*this, PM); } +void ARCPassConfig::addIRPasses() { + addPass(createAtomicExpandPass()); + + TargetPassConfig::addIRPasses(); +} + bool ARCPassConfig::addInstSelector() { addPass(createARCISelDag(getARCTargetMachine(), getOptLevel())); return false; diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 1d6aaeb7433b..cb3a709f7003 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -747,7 +747,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index adc429b61bbc..e99ee299412a 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -372,7 +372,7 @@ INITIALIZE_PASS(ARMDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false) /// operand. If so Imm will receive the 32-bit value. 
static bool isInt32Immediate(SDNode *N, unsigned &Imm) { if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { - Imm = cast<ConstantSDNode>(N)->getZExtValue(); + Imm = N->getAsZExtVal(); return true; } return false; @@ -1101,8 +1101,7 @@ bool ARMDAGToDAGISel::SelectAddrModePC(SDValue N, if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) { Offset = N.getOperand(0); SDValue N1 = N.getOperand(1); - Label = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(), - SDLoc(N), MVT::i32); + Label = CurDAG->getTargetConstant(N1->getAsZExtVal(), SDLoc(N), MVT::i32); return true; } @@ -1942,7 +1941,7 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl, if (!is64BitVector && NumVecs < 3) NumRegs *= 2; - unsigned Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + unsigned Alignment = Align->getAsZExtVal(); if (Alignment >= 32 && NumRegs == 4) Alignment = 32; else if (Alignment >= 16 && (NumRegs == 2 || NumRegs == 4)) @@ -2428,7 +2427,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating, unsigned Alignment = 0; if (NumVecs != 3) { - Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + Alignment = Align->getAsZExtVal(); unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; if (Alignment > NumBytes) Alignment = NumBytes; @@ -2871,7 +2870,7 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, Ops.push_back(N->getOperand(OpIdx++)); // limit SDValue ImmOp = N->getOperand(OpIdx++); // step - int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue(); + int ImmValue = ImmOp->getAsZExtVal(); Ops.push_back(getI32Imm(ImmValue, Loc)); if (Predicated) @@ -2892,7 +2891,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, // Convert and append the immediate operand designating the coprocessor. SDValue ImmCorpoc = N->getOperand(OpIdx++); - uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue(); + uint32_t ImmCoprocVal = ImmCorpoc->getAsZExtVal(); Ops.push_back(getI32Imm(ImmCoprocVal, Loc)); // For accumulating variants copy the low and high order parts of the @@ -2911,7 +2910,7 @@ void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, // Convert and append the immediate operand SDValue Imm = N->getOperand(OpIdx); - uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue(); + uint32_t ImmVal = Imm->getAsZExtVal(); Ops.push_back(getI32Imm(ImmVal, Loc)); // Accumulating variants are IT-predicable, add predicate operands. @@ -2965,7 +2964,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, unsigned Alignment = 0; if (NumVecs != 3) { - Alignment = cast<ConstantSDNode>(Align)->getZExtValue(); + Alignment = Align->getAsZExtVal(); unsigned NumBytes = NumVecs * VT.getScalarSizeInBits() / 8; if (Alignment > NumBytes) Alignment = NumBytes; @@ -3697,7 +3696,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. 
break; case ISD::Constant: { - unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); + unsigned Val = N->getAsZExtVal(); // If we can't materialize the constant we need to use a literal pool if (ConstantMaterializationCost(Val, Subtarget) > 2 && !Subtarget->genExecuteOnly()) { @@ -4132,7 +4131,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { assert(N2.getOpcode() == ISD::Constant); assert(N3.getOpcode() == ISD::Register); - unsigned CC = (unsigned) cast<ConstantSDNode>(N2)->getZExtValue(); + unsigned CC = (unsigned)N2->getAsZExtVal(); if (InGlue.getOpcode() == ARMISD::CMPZ) { if (InGlue.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN) { @@ -4243,8 +4242,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) { if (SwitchEQNEToPLMI) { SDValue ARMcc = N->getOperand(2); - ARMCC::CondCodes CC = - (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal(); switch (CC) { default: llvm_unreachable("CMPZ must be either NE or EQ!"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9f3bcffc7a99..568085bd0ab3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -4820,8 +4820,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // some tweaks to the heuristics for the previous and->shift transform. // FIXME: Optimize cases where the LHS isn't a shift. if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && - isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && + isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U && CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && LHS.getConstantOperandVal(1) < 31) { unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1; @@ -5533,7 +5532,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); // Choose GE over PL, which vsel does now support - if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) + if (ARMcc->getAsZExtVal() == ARMCC::PL) ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32); return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); } @@ -7749,7 +7748,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, uint64_t Val; if (!isa<ConstantSDNode>(N)) return SDValue(); - Val = cast<ConstantSDNode>(N)->getZExtValue(); + Val = N->getAsZExtVal(); if (ST->isThumb1Only()) { if (Val <= 255 || ~Val <= 255) @@ -7804,7 +7803,7 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, SDValue V = Op.getOperand(i); if (!isa<ConstantSDNode>(V) && !V.isUndef()) continue; - bool BitSet = V.isUndef() ? false : cast<ConstantSDNode>(V)->getZExtValue(); + bool BitSet = V.isUndef() ? 
false : V->getAsZExtVal(); if (BitSet) Bits32 |= BoolMask << (i * BitsPerBool); } @@ -9240,7 +9239,7 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, EVT VT = Op.getValueType(); EVT Op1VT = V1.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); + unsigned Index = V2->getAsZExtVal(); assert(VT.getScalarSizeInBits() == 1 && "Unexpected custom EXTRACT_SUBVECTOR lowering"); @@ -14618,7 +14617,7 @@ static SDValue PerformORCombineToBFI(SDNode *N, // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask // where lsb(mask) == #shamt and masked bits of B are known zero. SDValue ShAmt = N00.getOperand(1); - unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); + unsigned ShAmtC = ShAmt->getAsZExtVal(); unsigned LSB = llvm::countr_zero(Mask); if (ShAmtC != LSB) return SDValue(); @@ -18339,8 +18338,7 @@ ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { SDValue Chain = N->getOperand(0); SDValue BB = N->getOperand(1); SDValue ARMcc = N->getOperand(2); - ARMCC::CondCodes CC = - (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal(); // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) // -> (brcond Chain BB CC CPSR Cmp) @@ -18373,8 +18371,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { SDValue FalseVal = N->getOperand(0); SDValue TrueVal = N->getOperand(1); SDValue ARMcc = N->getOperand(2); - ARMCC::CondCodes CC = - (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal(); // BFI is only available on V6T2+. if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 3ffde86ce1bb..abea0fef5cdc 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -362,8 +362,8 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, llvm_unreachable("Unsupported size for FCmp predicate"); } -bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { using namespace TargetOpcode; MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; @@ -392,7 +392,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, OriginalResult}; auto Status = createLibcall(MIRBuilder, Libcall, {RetRegs, RetTy, 0}, {{MI.getOperand(1).getReg(), ArgTy, 0}, - {MI.getOperand(2).getReg(), ArgTy, 0}}); + {MI.getOperand(2).getReg(), ArgTy, 0}}, + LocObserver, &MI); if (Status != LegalizerHelper::Legalized) return false; break; @@ -428,7 +429,8 @@ bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, auto Status = createLibcall(MIRBuilder, Libcall.LibcallID, {LibcallResult, RetTy, 0}, {{MI.getOperand(2).getReg(), ArgTy, 0}, - {MI.getOperand(3).getReg(), ArgTy, 0}}); + {MI.getOperand(3).getReg(), ArgTy, 0}}, + LocObserver, &MI); if (Status != LegalizerHelper::Legalized) return false; diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h index f1c2e9c94336..d6ce4eb1055b 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h @@ -23,12 +23,12 @@ namespace llvm { class ARMSubtarget; -/// This class provides the information 
for the target register banks. class ARMLegalizerInfo : public LegalizerInfo { public: ARMLegalizerInfo(const ARMSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; private: void setFCmpLibcallsGNU(); diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp index d36bfb188ed3..f91e77adb8f8 100644 --- a/llvm/lib/Target/AVR/AVRISelLowering.cpp +++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp @@ -660,7 +660,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, SDValue Cmp; if (LHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(RHS)) { - uint64_t Imm = cast<ConstantSDNode>(RHS)->getZExtValue(); + uint64_t Imm = RHS->getAsZExtVal(); // Generate a CPI/CPC pair if RHS is a 16-bit constant. Use the zero // register for the constant RHS if its lower or higher byte is zero. SDValue LHSlo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS, @@ -680,7 +680,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, } else if (RHS.getSimpleValueType() == MVT::i16 && isa<ConstantSDNode>(LHS)) { // Generate a CPI/CPC pair if LHS is a 16-bit constant. Use the zero // register for the constant LHS if its lower or higher byte is zero. - uint64_t Imm = cast<ConstantSDNode>(LHS)->getZExtValue(); + uint64_t Imm = LHS->getAsZExtVal(); SDValue LHSlo = (Imm & 0xff) == 0 ? DAG.getRegister(Subtarget.getZeroRegister(), MVT::i8) : DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i8, LHS, diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp index 2fe86e75ddae..4d8ace7c1ece 100644 --- a/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -151,6 +151,7 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, } setBooleanContents(ZeroOrOneBooleanContent); + setMaxAtomicSizeInBitsSupported(64); // Function alignments setMinFunctionAlignment(Align(8)); diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index ab0db576f7f7..8a6e7ae3663e 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -108,7 +108,8 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) { return new BPFPassConfig(*this, PM); } -void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void BPFTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, FunctionPassManager &FPM, ArrayRef<PassBuilder::PipelineElement>) { @@ -148,7 +149,9 @@ void BPFTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { } void BPFPassConfig::addIRPasses() { + addPass(createAtomicExpandPass()); addPass(createBPFCheckAndAdjustIR()); + TargetPassConfig::addIRPasses(); } diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.h b/llvm/lib/Target/BPF/BPFTargetMachine.h index 4e6adc722e76..0a28394463b2 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.h +++ b/llvm/lib/Target/BPF/BPFTargetMachine.h @@ -42,7 +42,8 @@ public: return TLOF.get(); } - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; }; } diff --git a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h index 8ffa1d7cd9b3..bce41160b95e 100644 
--- a/llvm/lib/Target/DirectX/DXILResourceAnalysis.h +++ b/llvm/lib/Target/DirectX/DXILResourceAnalysis.h @@ -36,6 +36,7 @@ class DXILResourcePrinterPass : public PassInfoMixin<DXILResourcePrinterPass> { public: explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + static bool isRequired() { return true; } }; /// The legacy pass manager's analysis pass to compute DXIL resource diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index d5cb488f2fde..06938f8c74f1 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -100,7 +100,8 @@ DirectXTargetMachine::DirectXTargetMachine(const Target &T, const Triple &TT, DirectXTargetMachine::~DirectXTargetMachine() {} -void DirectXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void DirectXTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, ModulePassManager &PM, ArrayRef<PassBuilder::PipelineElement>) { diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.h b/llvm/lib/Target/DirectX/DirectXTargetMachine.h index d04c375b2736..428beaf61cd0 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.h +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.h @@ -47,7 +47,8 @@ public: } TargetTransformInfo getTargetTransformInfo(const Function &F) const override; - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; }; } // namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index eb5c59672224..defb1f7324f4 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -743,7 +743,7 @@ void HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) { // void HexagonDAGToDAGISel::SelectConstant(SDNode *N) { if (N->getValueType(0) == MVT::i1) { - assert(!(cast<ConstantSDNode>(N)->getZExtValue() >> 1)); + assert(!(N->getAsZExtVal() >> 1)); unsigned Opc = (cast<ConstantSDNode>(N)->getSExtValue() != 0) ? Hexagon::PS_true : Hexagon::PS_false; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 665e2d79c83d..81035849491b 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -1256,7 +1256,7 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV, SDValue IdxV, const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const { MVT VecTy = ty(VecV); unsigned HwLen = Subtarget.getVectorLength(); - unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); + unsigned Idx = IdxV.getNode()->getAsZExtVal(); MVT ElemTy = VecTy.getVectorElementType(); unsigned ElemWidth = ElemTy.getSizeInBits(); @@ -1299,7 +1299,7 @@ HexagonTargetLowering::extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); SDValue ByteVec = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, VecV); // IdxV is required to be a constant. 
- unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); + unsigned Idx = IdxV.getNode()->getAsZExtVal(); unsigned ResLen = ResTy.getVectorNumElements(); unsigned BitBytes = HwLen / VecTy.getVectorNumElements(); @@ -1801,7 +1801,7 @@ HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) MVT SrcTy = ty(SrcV); MVT DstTy = ty(Op); SDValue IdxV = Op.getOperand(1); - unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue(); + unsigned Idx = IdxV.getNode()->getAsZExtVal(); assert(Idx % DstTy.getVectorNumElements() == 0); (void)Idx; const SDLoc &dl(Op); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 590e464e1653..e7a692d67ba0 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -274,7 +274,8 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const { return I.get(); } -void HexagonTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void HexagonTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerLateLoopOptimizationsEPCallback( [=](LoopPassManager &LPM, OptimizationLevel Level) { LPM.addPass(HexagonLoopIdiomRecognitionPass()); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index dddd79ad1fcf..c5fed0cd65a8 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -34,7 +34,8 @@ public: ~HexagonTargetMachine() override; const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; TargetPassConfig *createPassConfig(PassManagerBase &PM) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp index 17d7ffb586f4..06de2ff1ae3e 100644 --- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp @@ -166,6 +166,8 @@ LanaiTargetLowering::LanaiTargetLowering(const TargetMachine &TM, // Booleans always contain 0 or 1. setBooleanContents(ZeroOrOneBooleanContent); + + setMaxAtomicSizeInBitsSupported(0); } SDValue LanaiTargetLowering::LowerOperation(SDValue Op, diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 039182b3ffe6..33479720183b 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -93,6 +93,7 @@ public: return getTM<LanaiTargetMachine>(); } + void addIRPasses() override; bool addInstSelector() override; void addPreSched2() override; void addPreEmitPass() override; @@ -104,6 +105,12 @@ LanaiTargetMachine::createPassConfig(PassManagerBase &PassManager) { return new LanaiPassConfig(*this, &PassManager); } +void LanaiPassConfig::addIRPasses() { + addPass(createAtomicExpandPass()); + + TargetPassConfig::addIRPasses(); +} + // Install an instruction selector pass. 
bool LanaiPassConfig::addInstSelector() { addPass(createLanaiISelDag(getLanaiTargetMachine())); diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 66a37fce5dda..46f63a4103f9 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -121,6 +121,10 @@ class LoongArchAsmParser : public MCTargetAsmParser { // Helper to emit pseudo instruction "li.w/d $rd, $imm". void emitLoadImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); + // Helper to emit pseudo instruction "call36 sym" or "tail36 $rj, sym". + void emitFuncCall36(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + bool IsTailCall); + public: enum LoongArchMatchResultTy { Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY, @@ -400,6 +404,22 @@ public: IsValidKind; } + bool isSImm20pcaddu18i() const { + if (!isImm()) + return false; + + int64_t Imm; + LoongArchMCExpr::VariantKind VK = LoongArchMCExpr::VK_LoongArch_None; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + bool IsValidKind = VK == LoongArchMCExpr::VK_LoongArch_None || + VK == LoongArchMCExpr::VK_LoongArch_CALL36; + + return IsConstantImm + ? isInt<20>(Imm) && IsValidKind + : LoongArchAsmParser::classifySymbolRef(getImm(), VK) && + IsValidKind; + } + bool isSImm21lsl2() const { if (!isImm()) return false; @@ -1110,6 +1130,35 @@ void LoongArchAsmParser::emitLoadImm(MCInst &Inst, SMLoc IDLoc, } } +void LoongArchAsmParser::emitFuncCall36(MCInst &Inst, SMLoc IDLoc, + MCStreamer &Out, bool IsTailCall) { + // call36 sym + // expands to: + // pcaddu18i $ra, %call36(sym) + // jirl $ra, $ra, 0 + // + // tail36 $rj, sym + // expands to: + // pcaddu18i $rj, %call36(sym) + // jirl $r0, $rj, 0 + unsigned ScratchReg = + IsTailCall ? Inst.getOperand(0).getReg() : (unsigned)LoongArch::R1; + const MCExpr *Sym = + IsTailCall ? Inst.getOperand(1).getExpr() : Inst.getOperand(0).getExpr(); + const LoongArchMCExpr *LE = LoongArchMCExpr::create( + Sym, llvm::LoongArchMCExpr::VK_LoongArch_CALL36, getContext()); + + Out.emitInstruction( + MCInstBuilder(LoongArch::PCADDU18I).addReg(ScratchReg).addExpr(LE), + getSTI()); + Out.emitInstruction( + MCInstBuilder(LoongArch::JIRL) + .addReg(IsTailCall ? (unsigned)LoongArch::R0 : ScratchReg) + .addReg(ScratchReg) + .addImm(0), + getSTI()); +} + bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out) { @@ -1158,6 +1207,12 @@ bool LoongArchAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case LoongArch::PseudoLI_D: emitLoadImm(Inst, IDLoc, Out); return false; + case LoongArch::PseudoCALL36: + emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/false); + return false; + case LoongArch::PseudoTAIL36: + emitFuncCall36(Inst, IDLoc, Out, /*IsTailCall=*/true); + return false; } Out.emitInstruction(Inst, getSTI()); return false; @@ -1439,6 +1494,12 @@ bool LoongArchAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, /*Upper=*/(1 << 19) - 1, "operand must be a symbol with modifier (e.g. %pc_hi20) or an integer " "in the range"); + case Match_InvalidSImm20pcaddu18i: + return generateImmOutOfRangeError( + Operands, ErrorInfo, /*Lower=*/-(1 << 19), + /*Upper=*/(1 << 19) - 1, + "operand must be a symbol with modifier (e.g. 
%call36) or an integer " + "in the range"); case Match_InvalidSImm21lsl2: return generateImmOutOfRangeError( Operands, ErrorInfo, /*Lower=*/-(1 << 22), /*Upper=*/(1 << 22) - 4, diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 72c1f1cec198..ad39658f698e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -62,43 +62,24 @@ private: MachineBasicBlock::iterator &NextMBBI, unsigned FlagsHi, unsigned SecondOpcode, unsigned FlagsLo); - bool expandLargeAddressLoad(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - unsigned LastOpcode, unsigned IdentifyingMO); - bool expandLargeAddressLoad(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - unsigned LastOpcode, unsigned IdentifyingMO, - const MachineOperand &Symbol, Register DestReg, - bool EraseFromParent); bool expandLoadAddressPcrel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressGot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSLE(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSIE(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSLD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); + MachineBasicBlock::iterator &NextMBBI); bool expandLoadAddressTLSGD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool Large = false); - bool expandFunctionCALL(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, - bool IsTailCall); + MachineBasicBlock::iterator &NextMBBI); }; char LoongArchPreRAExpandPseudo::ID = 0; @@ -131,30 +112,16 @@ bool LoongArchPreRAExpandPseudo::expandMI( switch (MBBI->getOpcode()) { case LoongArch::PseudoLA_PCREL: return expandLoadAddressPcrel(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_PCREL_LARGE: - return expandLoadAddressPcrel(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_GOT: return expandLoadAddressGot(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_GOT_LARGE: - return expandLoadAddressGot(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_LE: return expandLoadAddressTLSLE(MBB, MBBI, NextMBBI); case LoongArch::PseudoLA_TLS_IE: return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_IE_LARGE: - return expandLoadAddressTLSIE(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_LD: return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_LD_LARGE: - return expandLoadAddressTLSLD(MBB, MBBI, NextMBBI, /*Large=*/true); case LoongArch::PseudoLA_TLS_GD: return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI); - case LoongArch::PseudoLA_TLS_GD_LARGE: - return expandLoadAddressTLSGD(MBB, MBBI, NextMBBI, /*Large=*/true); - case LoongArch::PseudoCALL: - return expandFunctionCALL(MBB, MBBI, NextMBBI, 
/*IsTailCall=*/false); - case LoongArch::PseudoTAIL: - return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); } return false; } @@ -187,118 +154,9 @@ bool LoongArchPreRAExpandPseudo::expandPcalau12iInstPair( return true; } -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, - unsigned IdentifyingMO) { - MachineInstr &MI = *MBBI; - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, - MI.getOperand(2), MI.getOperand(0).getReg(), - true); -} - -bool LoongArchPreRAExpandPseudo::expandLargeAddressLoad( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, - unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, - bool EraseFromParent) { - // Code Sequence: - // - // Part1: pcalau12i $scratch, %MO1(sym) - // Part0: addi.d $dest, $zero, %MO0(sym) - // Part2: lu32i.d $dest, %MO2(sym) - // Part3: lu52i.d $dest, $dest, %MO3(sym) - // Fin: LastOpcode $dest, $dest, $scratch - - unsigned MO0, MO1, MO2, MO3; - switch (IdentifyingMO) { - default: - llvm_unreachable("unsupported identifying MO"); - case LoongArchII::MO_PCREL_LO: - MO0 = IdentifyingMO; - MO1 = LoongArchII::MO_PCREL_HI; - MO2 = LoongArchII::MO_PCREL64_LO; - MO3 = LoongArchII::MO_PCREL64_HI; - break; - case LoongArchII::MO_GOT_PC_HI: - case LoongArchII::MO_LD_PC_HI: - case LoongArchII::MO_GD_PC_HI: - // These cases relocate just like the GOT case, except for Part1. - MO0 = LoongArchII::MO_GOT_PC_LO; - MO1 = IdentifyingMO; - MO2 = LoongArchII::MO_GOT_PC64_LO; - MO3 = LoongArchII::MO_GOT_PC64_HI; - break; - case LoongArchII::MO_IE_PC_LO: - MO0 = IdentifyingMO; - MO1 = LoongArchII::MO_IE_PC_HI; - MO2 = LoongArchII::MO_IE_PC64_LO; - MO3 = LoongArchII::MO_IE_PC64_HI; - break; - } - - MachineFunction *MF = MBB.getParent(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - - assert(MF->getSubtarget<LoongArchSubtarget>().is64Bit() && - "Large code model requires LA64"); - - Register TmpPart1 = - MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); - Register TmpPart0 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - Register TmpParts02 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - Register TmpParts023 = - DestReg.isVirtual() - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : DestReg; - - auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), TmpPart1); - auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), TmpPart0) - .addReg(LoongArch::R0); - auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), TmpParts02) - // "rj" is needed due to InstrInfo pattern requirement. 
- .addReg(TmpPart0, RegState::Kill); - auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), TmpParts023) - .addReg(TmpParts02, RegState::Kill); - BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) - .addReg(TmpParts023) - .addReg(TmpPart1, RegState::Kill); - - if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { - const char *SymName = Symbol.getSymbolName(); - Part0.addExternalSymbol(SymName, MO0); - Part1.addExternalSymbol(SymName, MO1); - Part2.addExternalSymbol(SymName, MO2); - Part3.addExternalSymbol(SymName, MO3); - } else { - Part0.addDisp(Symbol, 0, MO0); - Part1.addDisp(Symbol, 0, MO1); - Part2.addDisp(Symbol, 0, MO2); - Part3.addDisp(Symbol, 0, MO3); - } - - if (EraseFromParent) - MI.eraseFromParent(); - - return true; -} - bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%pc` family of - // relocs. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_PCREL_LO); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %pc_hi20(sym) // addi.w/d $rd, $rd, %pc_lo12(sym) @@ -311,13 +169,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressPcrel( bool LoongArchPreRAExpandPseudo::expandLoadAddressGot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, loading the result from GOT with `ldx.d` in the end. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, - LoongArchII::MO_GOT_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %got_pc_hi20(sym) // ld.w/d $rd, $rd, %got_pc_lo12(sym) @@ -378,13 +230,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLE( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%ie_pc` family - // of relocs, loading the result with `ldx.d` in the end. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, - LoongArchII::MO_IE_PC_LO); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %ie_pc_hi20(sym) // ld.w/d $rd, $rd, %ie_pc_lo12(sym) @@ -397,13 +243,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSIE( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. 
- return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_LD_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %ld_pc_hi20(sym) // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -416,13 +256,7 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSLD( bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool Large) { - if (Large) - // Emit the 5-insn large address load sequence with the `%got_pc` family - // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. - return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, - LoongArchII::MO_GD_PC_HI); - + MachineBasicBlock::iterator &NextMBBI) { // Code Sequence: // pcalau12i $rd, %gd_pc_hi20(sym) // addi.w/d $rd, $rd, %got_pc_lo12(sym) @@ -433,88 +267,6 @@ bool LoongArchPreRAExpandPseudo::expandLoadAddressTLSGD( SecondOpcode, LoongArchII::MO_GOT_PC_LO); } -bool LoongArchPreRAExpandPseudo::expandFunctionCALL( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { - MachineFunction *MF = MBB.getParent(); - MachineInstr &MI = *MBBI; - DebugLoc DL = MI.getDebugLoc(); - const MachineOperand &Func = MI.getOperand(0); - MachineInstrBuilder CALL; - unsigned Opcode; - - switch (MF->getTarget().getCodeModel()) { - default: - report_fatal_error("Unsupported code model"); - break; - case CodeModel::Small: { - // CALL: - // bl func - // TAIL: - // b func - Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); - break; - } - case CodeModel::Medium: { - // CALL: - // pcalau12i $ra, %pc_hi20(func) - // jirl $ra, $ra, %pc_lo12(func) - // TAIL: - // pcalau12i $scratch, %pc_hi20(func) - // jirl $r0, $scratch, %pc_lo12(func) - Opcode = - IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; - Register ScratchReg = - IsTailCall - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : LoongArch::R1; - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), ScratchReg); - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg); - if (Func.isSymbol()) { - const char *FnName = Func.getSymbolName(); - MIB.addExternalSymbol(FnName, LoongArchII::MO_PCREL_HI); - CALL.addExternalSymbol(FnName, LoongArchII::MO_PCREL_LO); - break; - } - assert(Func.isGlobal() && "Expected a GlobalValue at this time"); - const GlobalValue *GV = Func.getGlobal(); - MIB.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_HI); - CALL.addGlobalAddress(GV, 0, LoongArchII::MO_PCREL_LO); - break; - } - case CodeModel::Large: { - // Emit the 5-insn large address load sequence, either directly or - // indirectly in case of going through the GOT, then JIRL_TAIL or - // JIRL_CALL to $addr. - Opcode = - IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; - Register AddrReg = - IsTailCall - ? MF->getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass) - : LoongArch::R1; - - bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); - unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; - unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; - expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, - false); - CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); - break; - } - } - - // Transfer implicit operands. 
- CALL.copyImplicitOps(MI); - - // Transfer MI flags. - CALL.setMIFlags(MI.getFlags()); - - MI.eraseFromParent(); - return true; -} - class LoongArchExpandPseudo : public MachineFunctionPass { public: const LoongArchInstrInfo *TII; @@ -536,6 +288,35 @@ private: MachineBasicBlock::iterator &NextMBBI); bool expandCopyCFR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandLargeAddressLoad(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LastOpcode, unsigned IdentifyingMO); + bool expandLargeAddressLoad(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + unsigned LastOpcode, unsigned IdentifyingMO, + const MachineOperand &Symbol, Register DestReg, + bool EraseFromParent); + bool expandLoadAddressPcrelLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressGotLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSIELarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSLDLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandLoadAddressTLSGDLarge(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + bool expandFunctionCALL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, + bool IsTailCall); }; char LoongArchExpandPseudo::ID = 0; @@ -570,6 +351,24 @@ bool LoongArchExpandPseudo::expandMI(MachineBasicBlock &MBB, switch (MBBI->getOpcode()) { case LoongArch::PseudoCopyCFR: return expandCopyCFR(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_PCREL_LARGE: + return expandLoadAddressPcrelLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_GOT_LARGE: + return expandLoadAddressGotLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_IE_LARGE: + return expandLoadAddressTLSIELarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_LD_LARGE: + return expandLoadAddressTLSLDLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoLA_TLS_GD_LARGE: + return expandLoadAddressTLSGDLarge(MBB, MBBI, NextMBBI); + case LoongArch::PseudoCALL: + case LoongArch::PseudoCALL_MEDIUM: + case LoongArch::PseudoCALL_LARGE: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/false); + case LoongArch::PseudoTAIL: + case LoongArch::PseudoTAIL_MEDIUM: + case LoongArch::PseudoTAIL_LARGE: + return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); } return false; @@ -628,6 +427,212 @@ bool LoongArchExpandPseudo::expandCopyCFR( return true; } +bool LoongArchExpandPseudo::expandLargeAddressLoad( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, + unsigned IdentifyingMO) { + MachineInstr &MI = *MBBI; + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LastOpcode, IdentifyingMO, + MI.getOperand(2), MI.getOperand(0).getReg(), + true); +} + +bool LoongArchExpandPseudo::expandLargeAddressLoad( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, unsigned LastOpcode, + unsigned IdentifyingMO, const MachineOperand &Symbol, Register DestReg, + bool EraseFromParent) { + // Code Sequence: + // + // Part1: pcalau12i $dst, %MO1(sym) 
+ // Part0: addi.d $t8, $zero, %MO0(sym) + // Part2: lu32i.d $t8, %MO2(sym) + // Part3: lu52i.d $t8, $t8, %MO3(sym) + // Fin: LastOpcode $dst, $t8, $dst + + unsigned MO0, MO1, MO2, MO3; + switch (IdentifyingMO) { + default: + llvm_unreachable("unsupported identifying MO"); + case LoongArchII::MO_PCREL_LO: + MO0 = IdentifyingMO; + MO1 = LoongArchII::MO_PCREL_HI; + MO2 = LoongArchII::MO_PCREL64_LO; + MO3 = LoongArchII::MO_PCREL64_HI; + break; + case LoongArchII::MO_GOT_PC_HI: + case LoongArchII::MO_LD_PC_HI: + case LoongArchII::MO_GD_PC_HI: + // These cases relocate just like the GOT case, except for Part1. + MO0 = LoongArchII::MO_GOT_PC_LO; + MO1 = IdentifyingMO; + MO2 = LoongArchII::MO_GOT_PC64_LO; + MO3 = LoongArchII::MO_GOT_PC64_HI; + break; + case LoongArchII::MO_IE_PC_LO: + MO0 = IdentifyingMO; + MO1 = LoongArchII::MO_IE_PC_HI; + MO2 = LoongArchII::MO_IE_PC64_LO; + MO3 = LoongArchII::MO_IE_PC64_HI; + break; + } + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + Register ScratchReg = LoongArch::R20; // $t8 + + assert(MBB.getParent()->getSubtarget<LoongArchSubtarget>().is64Bit() && + "Large code model requires LA64"); + + auto Part1 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCALAU12I), DestReg); + auto Part0 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::ADDI_D), ScratchReg) + .addReg(LoongArch::R0); + auto Part2 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU32I_D), ScratchReg) + // "rj" is needed due to InstrInfo pattern requirement. + .addReg(ScratchReg); + auto Part3 = BuildMI(MBB, MBBI, DL, TII->get(LoongArch::LU52I_D), ScratchReg) + .addReg(ScratchReg); + BuildMI(MBB, MBBI, DL, TII->get(LastOpcode), DestReg) + .addReg(ScratchReg) + .addReg(DestReg); + + if (Symbol.getType() == MachineOperand::MO_ExternalSymbol) { + const char *SymName = Symbol.getSymbolName(); + Part0.addExternalSymbol(SymName, MO0); + Part1.addExternalSymbol(SymName, MO1); + Part2.addExternalSymbol(SymName, MO2); + Part3.addExternalSymbol(SymName, MO3); + } else { + Part0.addDisp(Symbol, 0, MO0); + Part1.addDisp(Symbol, 0, MO1); + Part2.addDisp(Symbol, 0, MO2); + Part3.addDisp(Symbol, 0, MO3); + } + + if (EraseFromParent) + MI.eraseFromParent(); + + return true; +} + +bool LoongArchExpandPseudo::expandLoadAddressPcrelLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%pc` family of + // relocs. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_PCREL_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressGotLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, loading the result from GOT with `ldx.d` in the end. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, + LoongArchII::MO_GOT_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSIELarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%ie_pc` family + // of relocs, loading the result with `ldx.d` in the end. 
+ return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::LDX_D, + LoongArchII::MO_IE_PC_LO); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSLDLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, with the `pcalau12i` insn relocated with `%ld_pc_hi20`. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_LD_PC_HI); +} + +bool LoongArchExpandPseudo::expandLoadAddressTLSGDLarge( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + // Emit the 5-insn large address load sequence with the `%got_pc` family + // of relocs, with the `pcalau12i` insn relocated with `%gd_pc_hi20`. + return expandLargeAddressLoad(MBB, MBBI, NextMBBI, LoongArch::ADD_D, + LoongArchII::MO_GD_PC_HI); +} + +bool LoongArchExpandPseudo::expandFunctionCALL( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI, bool IsTailCall) { + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Func = MI.getOperand(0); + MachineInstrBuilder CALL; + unsigned Opcode; + + switch (MF->getTarget().getCodeModel()) { + default: + report_fatal_error("Unsupported code model"); + break; + case CodeModel::Small: { + // CALL: + // bl func + // TAIL: + // b func + Opcode = IsTailCall ? LoongArch::PseudoB_TAIL : LoongArch::BL; + CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).add(Func); + break; + } + case CodeModel::Medium: { + // CALL: + // pcaddu18i $ra, %call36(func) + // jirl $ra, $ra, 0 + // TAIL: + // pcaddu18i $t8, %call36(func) + // jr $t8 + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register ScratchReg = IsTailCall ? LoongArch::R20 : LoongArch::R1; + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PCADDU18I), ScratchReg); + + CALL = + BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(ScratchReg).addImm(0); + + if (Func.isSymbol()) + MIB.addExternalSymbol(Func.getSymbolName(), LoongArchII::MO_CALL36); + else + MIB.addDisp(Func, 0, LoongArchII::MO_CALL36); + break; + } + case CodeModel::Large: { + // Emit the 5-insn large address load sequence, either directly or + // indirectly in case of going through the GOT, then JIRL_TAIL or + // JIRL_CALL to $addr. + Opcode = + IsTailCall ? LoongArch::PseudoJIRL_TAIL : LoongArch::PseudoJIRL_CALL; + Register AddrReg = IsTailCall ? LoongArch::R19 : LoongArch::R1; + + bool UseGOT = Func.isGlobal() && !Func.getGlobal()->isDSOLocal(); + unsigned MO = UseGOT ? LoongArchII::MO_GOT_PC_HI : LoongArchII::MO_PCREL_LO; + unsigned LAOpcode = UseGOT ? LoongArch::LDX_D : LoongArch::ADD_D; + expandLargeAddressLoad(MBB, MBBI, NextMBBI, LAOpcode, MO, Func, AddrReg, + false); + CALL = BuildMI(MBB, MBBI, DL, TII->get(Opcode)).addReg(AddrReg).addImm(0); + break; + } + } + + // Transfer implicit operands. + CALL.copyImplicitOps(MI); + + // Transfer MI flags. 
+ CALL.setMIFlags(MI.getFlags()); + + MI.eraseFromParent(); + return true; +} + } // end namespace INITIALIZE_PASS(LoongArchPreRAExpandPseudo, "loongarch-prera-expand-pseudo", diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e14bbadf9ed2..70f782b81270 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -525,8 +525,7 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (isa<ConstantSDNode>(Idx) && (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || - cast<ConstantSDNode>(Idx)->getZExtValue() < NumElts / 2)) + EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) return Op; return SDValue(); @@ -762,12 +761,13 @@ static SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty, template <class NodeTy> SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, + CodeModel::Model M, bool IsLocal) const { SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Addr = getTargetNode(N, DL, Ty, DAG, 0); - switch (DAG.getTarget().getCodeModel()) { + switch (M) { default: report_fatal_error("Unsupported code model"); @@ -808,24 +808,35 @@ SDValue LoongArchTargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, SDValue LoongArchTargetLowering::lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast<BlockAddressSDNode>(Op), DAG); + return getAddr(cast<BlockAddressSDNode>(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast<JumpTableSDNode>(Op), DAG); + return getAddr(cast<JumpTableSDNode>(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerConstantPool(SDValue Op, SelectionDAG &DAG) const { - return getAddr(cast<ConstantPoolSDNode>(Op), DAG); + return getAddr(cast<ConstantPoolSDNode>(Op), DAG, + DAG.getTarget().getCodeModel()); } SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op); assert(N->getOffset() == 0 && "unexpected offset in global node"); - return getAddr(N, DAG, N->getGlobal()->isDSOLocal()); + auto CM = DAG.getTarget().getCodeModel(); + const GlobalValue *GV = N->getGlobal(); + + if (GV->isDSOLocal() && isa<GlobalVariable>(GV)) { + if (auto GCM = dyn_cast<GlobalVariable>(GV)->getCodeModel()) + CM = *GCM; + } + + return getAddr(N, DAG, CM, GV->isDSOLocal()); } SDValue LoongArchTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N, @@ -1383,28 +1394,28 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, if (IntrinsicEnum == Intrinsic::loongarch_cacop_w && Subtarget.is64Bit()) return emitIntrinsicErrorMessage(Op, ErrorMsgReqLA32, DAG); // call void @llvm.loongarch.cacop.[d/w](uimm5, rj, simm12) - unsigned Imm1 = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm1 = Op2->getAsZExtVal(); int Imm2 = cast<ConstantSDNode>(Op.getOperand(4))->getSExtValue(); if (!isUInt<5>(Imm1) || !isInt<12>(Imm2)) return emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG); return Op; } case Intrinsic::loongarch_dbar: { - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); return !isUInt<15>(Imm) ? 
emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::DBAR, DL, MVT::Other, Chain, DAG.getConstant(Imm, DL, GRLenVT)); } case Intrinsic::loongarch_ibar: { - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); return !isUInt<15>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::IBAR, DL, MVT::Other, Chain, DAG.getConstant(Imm, DL, GRLenVT)); } case Intrinsic::loongarch_break: { - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); return !isUInt<15>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::BREAK, DL, MVT::Other, Chain, @@ -1413,7 +1424,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, case Intrinsic::loongarch_movgr2fcsr: { if (!Subtarget.hasBasicF()) return emitIntrinsicErrorMessage(Op, ErrorMsgReqF, DAG); - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); return !isUInt<2>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::MOVGR2FCSR, DL, MVT::Other, Chain, @@ -1422,7 +1433,7 @@ SDValue LoongArchTargetLowering::lowerINTRINSIC_VOID(SDValue Op, Op.getOperand(3))); } case Intrinsic::loongarch_syscall: { - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); return !isUInt<15>(Imm) ? emitIntrinsicErrorMessage(Op, ErrorMsgOOR, DAG) : DAG.getNode(LoongArchISD::SYSCALL, DL, MVT::Other, Chain, @@ -1925,7 +1936,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgReqF); return; } - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); if (!isUInt<2>(Imm)) { emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); return; @@ -1981,7 +1992,7 @@ void LoongArchTargetLowering::ReplaceNodeResults( CSR_CASE(iocsrrd_d); #undef CSR_CASE case Intrinsic::loongarch_csrrd_w: { - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); if (!isUInt<14>(Imm)) { emitErrorAndReplaceIntrinsicResults(N, Results, DAG, ErrorMsgOOR); return; @@ -3381,8 +3392,12 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { // TODO: Add more target-dependent nodes later. NODE_NAME_CASE(CALL) + NODE_NAME_CASE(CALL_MEDIUM) + NODE_NAME_CASE(CALL_LARGE) NODE_NAME_CASE(RET) NODE_NAME_CASE(TAIL) + NODE_NAME_CASE(TAIL_MEDIUM) + NODE_NAME_CASE(TAIL_LARGE) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) @@ -4240,15 +4255,31 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, // Emit the call. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + unsigned Op; + switch (DAG.getTarget().getCodeModel()) { + default: + report_fatal_error("Unsupported code model"); + case CodeModel::Small: + Op = IsTailCall ? LoongArchISD::TAIL : LoongArchISD::CALL; + break; + case CodeModel::Medium: + assert(Subtarget.is64Bit() && "Medium code model requires LA64"); + Op = IsTailCall ? LoongArchISD::TAIL_MEDIUM : LoongArchISD::CALL_MEDIUM; + break; + case CodeModel::Large: + assert(Subtarget.is64Bit() && "Large code model requires LA64"); + Op = IsTailCall ? 
LoongArchISD::TAIL_LARGE : LoongArchISD::CALL_LARGE; + break; + } if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - SDValue Ret = DAG.getNode(LoongArchISD::TAIL, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(Op, DL, NodeTys, Ops); DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge); return Ret; } - Chain = DAG.getNode(LoongArchISD::CALL, DL, NodeTys, Ops); + Chain = DAG.getNode(Op, DL, NodeTys, Ops); DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); Glue = Chain.getValue(1); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6f8878f9ccd5..72182623b2c3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -27,8 +27,12 @@ enum NodeType : unsigned { // TODO: add more LoongArchISDs CALL, + CALL_MEDIUM, + CALL_LARGE, RET, TAIL, + TAIL_MEDIUM, + TAIL_LARGE, // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. @@ -250,7 +254,8 @@ private: LoongArchCCAssignFn Fn) const; template <class NodeTy> - SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const; + SDValue getAddr(NodeTy *N, SelectionDAG &DAG, CodeModel::Model M, + bool IsLocal = true) const; SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, unsigned Opc, bool Large = false) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2fea0f33e9eb..78074c012876 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -69,6 +69,18 @@ def loongarch_ret : SDNode<"LoongArchISD::RET", SDTNone, def loongarch_tail : SDNode<"LoongArchISD::TAIL", SDT_LoongArchCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def loongarch_call_medium : SDNode<"LoongArchISD::CALL_MEDIUM", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_tail_medium : SDNode<"LoongArchISD::TAIL_MEDIUM", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_call_large : SDNode<"LoongArchISD::CALL_LARGE", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; @@ -377,6 +389,10 @@ def simm20_lu32id : SImm20Operand { let ParserMatchClass = SImmAsmOperand<20, "lu32id">; } +def simm20_pcaddu18i : SImm20Operand { + let ParserMatchClass = SImmAsmOperand<20, "pcaddu18i">; +} + def simm21_lsl2 : Operand<OtherVT> { let ParserMatchClass = SImmAsmOperand<21, "lsl2">; let EncoderMethod = "getImmOpValueAsr<2>"; @@ -832,7 +848,7 @@ def LU32I_D : Fmt1RI20<0x16000000, (outs GPR:$dst), "$rd, $imm20">; } def LU52I_D : ALU_2RI12<0x03000000, simm12_lu52id>; -def PCADDU18I : ALU_1RI20<0x1e000000, simm20>; +def PCADDU18I : ALU_1RI20<0x1e000000, simm20_pcaddu18i>; def MUL_D : ALU_3R<0x001d8000>; def MULH_D : ALU_3R<0x001e0000>; def MULH_DU : ALU_3R<0x001e8000>; @@ -1395,16 +1411,43 @@ def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), (PseudoBRIND 
GPR:$rj, simm16_lsl2:$imm16)>; +// Function call with 'Small' code model. let isCall = 1, Defs = [R1] in -def PseudoCALL : Pseudo<(outs), (ins simm26_symbol:$func)>; +def PseudoCALL : Pseudo<(outs), (ins bare_symbol:$func)>; def : Pat<(loongarch_call tglobaladdr:$func), (PseudoCALL tglobaladdr:$func)>; def : Pat<(loongarch_call texternalsym:$func), (PseudoCALL texternalsym:$func)>; +// Function call with 'Medium' code model. +let isCall = 1, Defs = [R1, R20], Size = 8 in +def PseudoCALL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium tglobaladdr:$func), + (PseudoCALL_MEDIUM tglobaladdr:$func)>; +def : Pat<(loongarch_call_medium texternalsym:$func), + (PseudoCALL_MEDIUM texternalsym:$func)>; +} // Predicates = [IsLA64] + +// Function call with 'Large' code model. +let isCall = 1, Defs = [R1, R20], Size = 24 in +def PseudoCALL_LARGE: Pseudo<(outs), (ins bare_symbol:$func)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_large tglobaladdr:$func), + (PseudoCALL_LARGE tglobaladdr:$func)>; +def : Pat<(loongarch_call_large texternalsym:$func), + (PseudoCALL_LARGE texternalsym:$func)>; +} // Predicates = [IsLA64] + let isCall = 1, Defs = [R1] in def PseudoCALLIndirect : Pseudo<(outs), (ins GPR:$rj), [(loongarch_call GPR:$rj)]>, PseudoInstExpansion<(JIRL R1, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_call_medium GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +def : Pat<(loongarch_call_large GPR:$rj), (PseudoCALLIndirect GPR:$rj)>; +} let isCall = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Defs = [R1] in def PseudoJIRL_CALL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, @@ -1415,18 +1458,47 @@ let isBarrier = 1, isReturn = 1, isTerminator = 1 in def PseudoRET : Pseudo<(outs), (ins), [(loongarch_ret)]>, PseudoInstExpansion<(JIRL R0, R1, 0)>; +// Tail call with 'Small' code model. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in -def PseudoTAIL : Pseudo<(outs), (ins simm26_symbol:$dst)>; +def PseudoTAIL : Pseudo<(outs), (ins bare_symbol:$dst)>; def : Pat<(loongarch_tail (iPTR tglobaladdr:$dst)), (PseudoTAIL tglobaladdr:$dst)>; def : Pat<(loongarch_tail (iPTR texternalsym:$dst)), (PseudoTAIL texternalsym:$dst)>; +// Tail call with 'Medium' code model. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [R3], Defs = [R20], Size = 8 in +def PseudoTAIL_MEDIUM : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium (iPTR tglobaladdr:$dst)), + (PseudoTAIL_MEDIUM tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_medium (iPTR texternalsym:$dst)), + (PseudoTAIL_MEDIUM texternalsym:$dst)>; +} // Predicates = [IsLA64] + +// Tail call with 'Large' code model. 
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + Uses = [R3], Defs = [R19, R20], Size = 24 in +def PseudoTAIL_LARGE : Pseudo<(outs), (ins bare_symbol:$dst)>; + +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_large (iPTR tglobaladdr:$dst)), + (PseudoTAIL_LARGE tglobaladdr:$dst)>; +def : Pat<(loongarch_tail_large (iPTR texternalsym:$dst)), + (PseudoTAIL_LARGE texternalsym:$dst)>; +} // Predicates = [IsLA64] + let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3] in def PseudoTAILIndirect : Pseudo<(outs), (ins GPRT:$rj), [(loongarch_tail GPRT:$rj)]>, PseudoInstExpansion<(JIRL R0, GPR:$rj, 0)>; +let Predicates = [IsLA64] in { +def : Pat<(loongarch_tail_medium GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +def : Pat<(loongarch_tail_large GPR:$rj), (PseudoTAILIndirect GPR:$rj)>; +} let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasSideEffects = 0, mayStore = 0, mayLoad = 0, Uses = [R3] in @@ -1439,6 +1511,19 @@ def PseudoJIRL_TAIL : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>; +/// call36/taill36 macro instructions +let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, isAsmParserOnly = 1, + Defs = [R1], Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in +def PseudoCALL36 : Pseudo<(outs), (ins bare_symbol:$dst), [], + "call36", "$dst">, + Requires<[IsLA64]>; +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [R3], + isCodeGenOnly = 0, isAsmParserOnly = 1, Size = 8, hasSideEffects = 0, + mayStore = 0, mayLoad = 0 in +def PseudoTAIL36 : Pseudo<(outs), (ins GPR:$tmp, bare_symbol:$dst), [], + "tail36", "$tmp, $dst">, + Requires<[IsLA64]>; + /// Load address (la*) macro instructions. // Define isCodeGenOnly = 0 to expose them to tablegened assembly parser. 
@@ -1451,6 +1536,7 @@ def PseudoLA_ABS_LARGE : Pseudo<(outs GPR:$dst), "la.abs", "$dst, $src">; def PseudoLA_PCREL : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.pcrel", "$dst, $src">; +let Defs = [R20], Size = 20 in def PseudoLA_PCREL_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.pcrel", "$dst, $tmp, $src">, @@ -1462,28 +1548,30 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 0, isAsmParserOnly = 1 in { def PseudoLA_GOT : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.got", "$dst, $src">; +def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ie", "$dst, $src">; +def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.ld", "$dst, $src">; +def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tls.gd", "$dst, $src">; +let Defs = [R20], Size = 20 in { def PseudoLA_GOT_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.got", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_IE : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.ie", "$dst, $src">; def PseudoLA_TLS_IE_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.ie", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_LD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.ld", "$dst, $src">; def PseudoLA_TLS_LD_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.ld", "$dst, $tmp, $src">, Requires<[IsLA64]>; -def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], - "la.tls.gd", "$dst, $src">; def PseudoLA_TLS_GD_LARGE : Pseudo<(outs GPR:$dst), (ins GPR:$tmp, bare_symbol:$src), [], "la.tls.gd", "$dst, $tmp, $src">, Requires<[IsLA64]>; +} // Defs = [R20], Size = 20 } // Load address inst alias: "la", "la.global" and "la.local". diff --git a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp index 5daa9481c907..98ad49f25e3f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchMCInstLower.cpp @@ -95,6 +95,9 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, case LoongArchII::MO_GD_PC_HI: Kind = LoongArchMCExpr::VK_LoongArch_TLS_GD_PC_HI20; break; + case LoongArchII::MO_CALL36: + Kind = LoongArchMCExpr::VK_LoongArch_CALL36; + break; // TODO: Handle more target-flags. } diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index 257b947a3ce4..092b5f1fb442 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -15,6 +15,7 @@ #include "LoongArch.h" #include "LoongArchInstrInfo.h" #include "LoongArchSubtarget.h" +#include "MCTargetDesc/LoongArchBaseInfo.h" #include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -194,3 +195,25 @@ bool LoongArchRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); return false; } + +bool LoongArchRegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + const MachineRegisterInfo *MRI = &MF.getRegInfo(); + const LoongArchFrameLowering *TFI = getFrameLowering(MF); + + // Stack realignment requires a frame pointer. 
If we already started + // register allocation with frame pointer elimination, it is too late now. + if (!MRI->canReserveReg(LoongArch::R22)) + return false; + + // We may also need a base pointer if there are dynamic allocas or stack + // pointer adjustments around calls. + if (TFI->hasReservedCallFrame(MF)) + return true; + + // A base pointer is required and allowed. Check that it isn't too late to + // reserve it. + return MRI->canReserveReg(LoongArchABI::getBPReg()); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h index 7e8f26b14097..d1e40254c297 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.h @@ -51,6 +51,7 @@ struct LoongArchRegisterInfo : public LoongArchGenRegisterInfo { bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { return true; } + bool canRealignStack(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index a5a4d78aceee..62ae1dea00d6 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -63,11 +63,11 @@ getEffectiveLoongArchCodeModel(const Triple &TT, switch (*CM) { case CodeModel::Small: - case CodeModel::Medium: return *CM; + case CodeModel::Medium: case CodeModel::Large: if (!TT.isArch64Bit()) - report_fatal_error("Large code model requires LA64"); + report_fatal_error("Medium/Large code model requires LA64"); return *CM; default: report_fatal_error( diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 6d8ef1bf96cb..518f6b10edab 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -91,6 +91,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_Data_leb128: return Value; case LoongArch::fixup_loongarch_b16: { if (!isInt<18>(Value)) @@ -128,6 +129,15 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } +static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, + MutableArrayRef<char> Data, uint64_t Value) { + unsigned I; + for (I = 0; I != Data.size() && Value; ++I, Value >>= 7) + Data[I] |= uint8_t(Value & 0x7f); + if (Value) + Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!"); +} + void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, @@ -143,6 +153,10 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, MCFixupKindInfo Info = getFixupKindInfo(Kind); MCContext &Ctx = Asm.getContext(); + // Fixup leb128 separately. + if (Fixup.getTargetKind() == FK_Data_leb128) + return fixupLeb128(Ctx, Fixup, Data, Value); + // Apply any target-specific value adjustments. 
Value = adjustFixupValue(Fixup, Value, Ctx); @@ -173,6 +187,7 @@ bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_Data_leb128: return !Target.isAbsolute(); } } @@ -202,9 +217,24 @@ getRelocPairForSize(unsigned Size) { return std::make_pair( MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD64), MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB64)); + case 128: + return std::make_pair( + MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_ADD_ULEB128), + MCFixupKind(FirstLiteralRelocationKind + ELF::R_LARCH_SUB_ULEB128)); } } +std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, + MCAsmLayout &Layout, + int64_t &Value) const { + const MCExpr &Expr = LF.getValue(); + if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, Layout)) + return std::make_pair(false, false); + LF.getFixups().push_back( + MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc())); + return std::make_pair(true, true); +} + bool LoongArchAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const { // We mostly follow binutils' convention here: align to 4-byte boundary with a @@ -226,21 +256,27 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, uint64_t &FixedValue) const { std::pair<MCFixupKind, MCFixupKind> FK; uint64_t FixedValueA, FixedValueB; - const MCSection &SecA = Target.getSymA()->getSymbol().getSection(); - const MCSection &SecB = Target.getSymB()->getSymbol().getSection(); - - // We need record relocation if SecA != SecB. Usually SecB is same as the - // section of Fixup, which will be record the relocation as PCRel. If SecB - // is not same as the section of Fixup, it will report error. Just return - // false and then this work can be finished by handleFixup. - if (&SecA != &SecB) - return false; - - // In SecA == SecB case. If the linker relaxation is enabled, we need record - // the ADD, SUB relocations. Otherwise the FixedValue has already been - // calculated out in evaluateFixup, return true and avoid record relocations. - if (!STI.hasFeature(LoongArch::FeatureRelax)) - return true; + const MCSymbol &SA = Target.getSymA()->getSymbol(); + const MCSymbol &SB = Target.getSymB()->getSymbol(); + + bool force = !SA.isInSection() || !SB.isInSection(); + if (!force) { + const MCSection &SecA = SA.getSection(); + const MCSection &SecB = SB.getSection(); + + // We need record relocation if SecA != SecB. Usually SecB is same as the + // section of Fixup, which will be record the relocation as PCRel. If SecB + // is not same as the section of Fixup, it will report error. Just return + // false and then this work can be finished by handleFixup. + if (&SecA != &SecB) + return false; + + // In SecA == SecB case. If the linker relaxation is enabled, we need record + // the ADD, SUB relocations. Otherwise the FixedValue has already been calc- + // ulated out in evaluateFixup, return true and avoid record relocations. 
+ if (!STI.hasFeature(LoongArch::FeatureRelax)) + return true; + } switch (Fixup.getKind()) { case llvm::FK_Data_1: @@ -255,6 +291,9 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAsmLayout &Layout, case llvm::FK_Data_8: FK = getRelocPairForSize(64); break; + case llvm::FK_Data_leb128: + FK = getRelocPairForSize(128); + break; default: llvm_unreachable("unsupported fixup size"); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index fef0e84600a7..71977217f59b 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -66,6 +66,9 @@ public: void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override {} + std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, + int64_t &Value) const override; + bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h index cee6dad1f095..0692cb92b694 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.h @@ -47,6 +47,7 @@ enum { MO_IE_PC64_HI, MO_LD_PC_HI, MO_GD_PC_HI, + MO_CALL36 // TODO: Add more flags. }; } // end namespace LoongArchII diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index fe19a4f2d3c8..1dec816f3473 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -90,6 +90,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_LARCH_TLS_LE64_LO20; case LoongArch::fixup_loongarch_tls_le64_hi12: return ELF::R_LARCH_TLS_LE64_HI12; + case LoongArch::fixup_loongarch_call36: + return ELF::R_LARCH_CALL36; // TODO: Handle more fixup-kinds. } } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h index 178fa6e5262b..e827bae1f3e3 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h @@ -108,7 +108,10 @@ enum Fixups { // 20-bit fixup corresponding to %gd_hi20(foo) for instruction lu12i.w. fixup_loongarch_tls_gd_hi20, // Generate an R_LARCH_RELAX which indicates the linker may relax here. - fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX + fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + // 36-bit fixup corresponding to %call36(foo) for a pair instructions: + // pcaddu18i+jirl. 
+ fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, }; } // end namespace LoongArch } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index d2ea062dc09a..9ac0128f2517 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -241,6 +241,9 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, case LoongArchMCExpr::VK_LoongArch_TLS_GD_HI20: FixupKind = LoongArch::fixup_loongarch_tls_gd_hi20; break; + case LoongArchMCExpr::VK_LoongArch_CALL36: + FixupKind = LoongArch::fixup_loongarch_call36; + break; } } else if (Kind == MCExpr::SymbolRef && cast<MCSymbolRefExpr>(Expr)->getKind() == diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp index 82c992b1cc8c..8ca8876a19b9 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp @@ -138,6 +138,8 @@ StringRef LoongArchMCExpr::getVariantKindName(VariantKind Kind) { return "gd_pc_hi20"; case VK_LoongArch_TLS_GD_HI20: return "gd_hi20"; + case VK_LoongArch_CALL36: + return "call36"; } } @@ -180,6 +182,7 @@ LoongArchMCExpr::getVariantKindForName(StringRef name) { .Case("ld_hi20", VK_LoongArch_TLS_LD_HI20) .Case("gd_pc_hi20", VK_LoongArch_TLS_GD_PC_HI20) .Case("gd_hi20", VK_LoongArch_TLS_GD_HI20) + .Case("call36", VK_LoongArch_CALL36) .Default(VK_LoongArch_Invalid); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h index 93251f824103..bd828116d7fa 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h @@ -61,6 +61,7 @@ public: VK_LoongArch_TLS_LD_HI20, VK_LoongArch_TLS_GD_PC_HI20, VK_LoongArch_TLS_GD_HI20, + VK_LoongArch_CALL36, VK_LoongArch_Invalid // Must be the last item. }; diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h index a10401ed1a9a..cbe30ec494c9 100644 --- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h +++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h @@ -20,7 +20,6 @@ namespace llvm { class M68kSubtarget; -/// This struct provides the information for the target register banks. struct M68kLegalizerInfo : public LegalizerInfo { public: M68kLegalizerInfo(const M68kSubtarget &ST); diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index c4d7a0dec7f3..158393f02a24 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -2375,7 +2375,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 
0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == M68kISD::SUB) { - unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + unsigned CondCode = CC->getAsZExtVal(); if ((CondCode == M68k::COND_CC || CondCode == M68k::COND_CS) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && @@ -2491,7 +2491,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; AddTest = false; } else { - switch (cast<ConstantSDNode>(CC)->getZExtValue()) { + switch (CC->getAsZExtVal()) { default: break; case M68k::COND_VS: diff --git a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index 660861a5d521..efb23b1a4e3f 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -308,12 +308,12 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) { switch (VT.getSimpleVT().SimpleTy) { case MVT::i8: - if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 1) + if (LD->getOffset()->getAsZExtVal() != 1) return false; break; case MVT::i16: - if (cast<ConstantSDNode>(LD->getOffset())->getZExtValue() != 2) + if (LD->getOffset()->getAsZExtVal() != 2) return false; break; diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index d3b59138a5a9..e68904863cfc 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -333,6 +333,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(Align(2)); setPrefFunctionAlignment(Align(2)); + setMaxAtomicSizeInBitsSupported(0); } SDValue MSP430TargetLowering::LowerOperation(SDValue Op, @@ -1168,8 +1169,8 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { bool Invert = false; bool Shift = false; bool Convert = true; - switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) { - default: + switch (TargetCC->getAsZExtVal()) { + default: Convert = false; break; case MSP430CC::COND_HS: @@ -1193,7 +1194,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // C = ~Z for AND instruction, thus we can put Res = ~(SR & 1), however, // Res = (SR >> 1) & 1 is 1 word shorter. break; - } + } EVT VT = Op.getValueType(); SDValue One = DAG.getConstant(1, dl, VT); if (Convert) { diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp index 39e0658eb70d..283de46e57d5 100644 --- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -65,6 +65,7 @@ public: return getTM<MSP430TargetMachine>(); } + void addIRPasses() override; bool addInstSelector() override; void addPreEmitPass() override; }; @@ -81,6 +82,12 @@ MachineFunctionInfo *MSP430TargetMachine::createMachineFunctionInfo( F, STI); } +void MSP430PassConfig::addIRPasses() { + addPass(createAtomicExpandPass()); + + TargetPassConfig::addIRPasses(); +} + bool MSP430PassConfig::addInstSelector() { // Install an instruction selector. 
addPass(createMSP430ISelDag(getMSP430TargetMachine(), getOptLevel())); diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index 7fcf375aa10b..192ed1cec79a 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -492,7 +492,7 @@ bool MipsFastISel::computeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); while (true) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 483eba4e4f47..d431d3d91494 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -2042,8 +2042,7 @@ SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const { return Op; SDValue CCNode = CondRes.getOperand(2); - Mips::CondCode CC = - (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue(); + Mips::CondCode CC = (Mips::CondCode)CCNode->getAsZExtVal(); unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T; SDValue BrCode = DAG.getConstant(Opc, DL, MVT::i32); SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32); diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp index 14f26201e6c0..f5e94235859a 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp @@ -330,8 +330,9 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) { verify(*ST.getInstrInfo()); } -bool MipsLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool MipsLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { using namespace TargetOpcode; MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/llvm/lib/Target/Mips/MipsLegalizerInfo.h index 05027b718a85..63daebf26470 100644 --- a/llvm/lib/Target/Mips/MipsLegalizerInfo.h +++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.h @@ -25,7 +25,8 @@ class MipsLegalizerInfo : public LegalizerInfo { public: MipsLegalizerInfo(const MipsSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 0ed87ee0809a..c0e978018919 100644 --- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -76,7 +76,7 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI, } unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const { - uint64_t RegNum = cast<ConstantSDNode>(RegIdx)->getZExtValue(); + uint64_t RegNum = RegIdx->getAsZExtVal(); return Mips::MSACtrlRegClass.getRegister(RegNum); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 815c46edb6fa..7abe984b34e1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2076,7 +2076,7 @@ bool 
NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { VTs = CurDAG->getVTList(EVTs); } - unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + unsigned OffsetVal = Offset->getAsZExtVal(); SmallVector<SDValue, 2> Ops; Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32)); @@ -2091,7 +2091,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Offset = N->getOperand(1); - unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + unsigned OffsetVal = Offset->getAsZExtVal(); MemSDNode *Mem = cast<MemSDNode>(N); // How many elements do we have? @@ -2158,9 +2158,9 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) { SDLoc DL(N); SDValue Chain = N->getOperand(0); SDValue Param = N->getOperand(1); - unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue(); + unsigned ParamVal = Param->getAsZExtVal(); SDValue Offset = N->getOperand(2); - unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + unsigned OffsetVal = Offset->getAsZExtVal(); MemSDNode *Mem = cast<MemSDNode>(N); SDValue Glue = N->getOperand(N->getNumOperands() - 1); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index e8f36bf50a1b..c65090d915ef 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -854,6 +854,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, computeRegisterProperties(STI.getRegisterInfo()); setMinCmpXchgSizeInBits(32); + setMaxAtomicSizeInBitsSupported(64); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -5811,7 +5812,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); // Get the intrinsic ID - unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue(); + unsigned IntrinNo = Intrin.getNode()->getAsZExtVal(); switch (IntrinNo) { default: return; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 8d895762fbe1..fad69f5e80a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -225,7 +225,8 @@ void NVPTXTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) { AAM.registerFunctionAnalysis<NVPTXAA>(); } -void NVPTXTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { +void NVPTXTargetMachine::registerPassBuilderCallbacks( + PassBuilder &PB, bool PopulateClassToPassNames) { PB.registerPipelineParsingCallback( [](StringRef PassName, FunctionPassManager &PM, ArrayRef<PassBuilder::PipelineElement>) { diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h index cfdd8da9b765..9e6bf929badb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h @@ -69,7 +69,8 @@ public: void registerDefaultAliasAnalyses(AAManager &AAM) override; - void registerPassBuilderCallbacks(PassBuilder &PB) override; + void registerPassBuilderCallbacks(PassBuilder &PB, + bool PopulateClassToPassNames) override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index c73721da46e3..7aa63f9fc0c9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -180,10 +180,6 @@ static Instruction 
*simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { return {Intrinsic::ceil, FTZ_MustBeOn}; case Intrinsic::nvvm_fabs_d: return {Intrinsic::fabs, FTZ_Any}; - case Intrinsic::nvvm_fabs_f: - return {Intrinsic::fabs, FTZ_MustBeOff}; - case Intrinsic::nvvm_fabs_ftz_f: - return {Intrinsic::fabs, FTZ_MustBeOn}; case Intrinsic::nvvm_floor_d: return {Intrinsic::floor, FTZ_Any}; case Intrinsic::nvvm_floor_f: @@ -264,12 +260,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { return {Intrinsic::minimum, FTZ_MustBeOff, true}; case Intrinsic::nvvm_fmin_ftz_nan_f16x2: return {Intrinsic::minimum, FTZ_MustBeOn, true}; - case Intrinsic::nvvm_round_d: - return {Intrinsic::round, FTZ_Any}; - case Intrinsic::nvvm_round_f: - return {Intrinsic::round, FTZ_MustBeOff}; - case Intrinsic::nvvm_round_ftz_f: - return {Intrinsic::round, FTZ_MustBeOn}; case Intrinsic::nvvm_sqrt_rn_d: return {Intrinsic::sqrt, FTZ_Any}; case Intrinsic::nvvm_sqrt_f: @@ -278,10 +268,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are // the versions with explicit ftz-ness. return {Intrinsic::sqrt, FTZ_Any}; - case Intrinsic::nvvm_sqrt_rn_f: - return {Intrinsic::sqrt, FTZ_MustBeOff}; - case Intrinsic::nvvm_sqrt_rn_ftz_f: - return {Intrinsic::sqrt, FTZ_MustBeOn}; case Intrinsic::nvvm_trunc_d: return {Intrinsic::trunc, FTZ_Any}; case Intrinsic::nvvm_trunc_f: @@ -316,24 +302,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { return {Instruction::UIToFP}; // NVVM intrinsics that map to LLVM binary ops. - case Intrinsic::nvvm_add_rn_d: - return {Instruction::FAdd, FTZ_Any}; - case Intrinsic::nvvm_add_rn_f: - return {Instruction::FAdd, FTZ_MustBeOff}; - case Intrinsic::nvvm_add_rn_ftz_f: - return {Instruction::FAdd, FTZ_MustBeOn}; - case Intrinsic::nvvm_mul_rn_d: - return {Instruction::FMul, FTZ_Any}; - case Intrinsic::nvvm_mul_rn_f: - return {Instruction::FMul, FTZ_MustBeOff}; - case Intrinsic::nvvm_mul_rn_ftz_f: - return {Instruction::FMul, FTZ_MustBeOn}; case Intrinsic::nvvm_div_rn_d: return {Instruction::FDiv, FTZ_Any}; - case Intrinsic::nvvm_div_rn_f: - return {Instruction::FDiv, FTZ_MustBeOff}; - case Intrinsic::nvvm_div_rn_ftz_f: - return {Instruction::FDiv, FTZ_MustBeOn}; // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but // need special handling. @@ -342,10 +312,6 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // as well. case Intrinsic::nvvm_rcp_rn_d: return {SPC_Reciprocal, FTZ_Any}; - case Intrinsic::nvvm_rcp_rn_f: - return {SPC_Reciprocal, FTZ_MustBeOff}; - case Intrinsic::nvvm_rcp_rn_ftz_f: - return {SPC_Reciprocal, FTZ_MustBeOn}; // We do not currently simplify intrinsics that give an approximate // answer. These include: diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 42f5a4e624c4..56af80f9cede 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -350,7 +350,7 @@ bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index ed96339240d9..26ed74108ec3 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -565,7 +565,7 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { /// operand. If so Imm will receive the 32-bit value. static bool isInt32Immediate(SDNode *N, unsigned &Imm) { if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i32) { - Imm = cast<ConstantSDNode>(N)->getZExtValue(); + Imm = N->getAsZExtVal(); return true; } return false; @@ -575,7 +575,7 @@ static bool isInt32Immediate(SDNode *N, unsigned &Imm) { /// operand. If so Imm will receive the 64-bit value. static bool isInt64Immediate(SDNode *N, uint64_t &Imm) { if (N->getOpcode() == ISD::Constant && N->getValueType(0) == MVT::i64) { - Imm = cast<ConstantSDNode>(N)->getZExtValue(); + Imm = N->getAsZExtVal(); return true; } return false; @@ -1500,7 +1500,7 @@ static SDNode *selectI64Imm(SelectionDAG *CurDAG, SDNode *N) { SDLoc dl(N); // Get 64 bit value. - int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue(); + int64_t Imm = N->getAsZExtVal(); if (unsigned MinSize = allUsesTruncate(CurDAG, N)) { uint64_t SextImm = SignExtend64(Imm, MinSize); SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64); @@ -4923,7 +4923,7 @@ bool PPCDAGToDAGISel::trySelectLoopCountIntrinsic(SDNode *N) { SDNode *NewDecrement = CurDAG->getMachineNode(DecrementOpcode, DecrementLoc, MVT::i1, DecrementOps); - unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue(); + unsigned Val = RHS->getAsZExtVal(); bool IsBranchOnTrue = (CC == ISD::SETEQ && Val) || (CC == ISD::SETNE && !Val); unsigned Opcode = IsBranchOnTrue ? PPC::BC : PPC::BCn; @@ -5765,7 +5765,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { break; // If the multiplier fits int16, we can handle it with mulli. - int64_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue(); + int64_t Imm = Op1->getAsZExtVal(); unsigned Shift = llvm::countr_zero<uint64_t>(Imm); if (isInt<16>(Imm) || !Shift) break; @@ -6612,8 +6612,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) { // For us to materialize these using one instruction, we must be able to // represent them as signed 16-bit integers. 
- uint64_t True = cast<ConstantSDNode>(TrueRes)->getZExtValue(), - False = cast<ConstantSDNode>(FalseRes)->getZExtValue(); + uint64_t True = TrueRes->getAsZExtVal(), False = FalseRes->getAsZExtVal(); if (!isInt<16>(True) || !isInt<16>(False)) break; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 8f27e6677afa..235df1880b37 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2566,7 +2566,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { if (LeadingZero) { if (!UniquedVals[Multiple-1].getNode()) return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef - int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue(); + int Val = UniquedVals[Multiple - 1]->getAsZExtVal(); if (Val < 16) // 0,0,0,4 -> vspltisw(4) return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32); } @@ -2635,11 +2635,11 @@ bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); + Imm = (int16_t)N->getAsZExtVal(); if (N->getValueType(0) == MVT::i32) - return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); + return Imm == (int32_t)N->getAsZExtVal(); else - return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); + return Imm == (int64_t)N->getAsZExtVal(); } bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); @@ -2684,7 +2684,7 @@ bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); + Imm = (int64_t)N->getAsZExtVal(); return isInt<34>(Imm); } bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) { @@ -15580,7 +15580,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR) break; - uint64_t Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + uint64_t Imm = Op2->getAsZExtVal(); // Make sure that the constant is narrow enough to fit in the narrow type. 
if (!isUInt<32>(Imm)) break; @@ -16795,7 +16795,7 @@ void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I, return; if (!isa<ConstantSDNode>(Ops[1].getNode())) return; - auto IntrinsicID = cast<ConstantSDNode>(Ops[1].getNode())->getZExtValue(); + auto IntrinsicID = Ops[1].getNode()->getAsZExtVal(); if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw && IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap) return; @@ -18430,7 +18430,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent, if (Flags & PPC::MOF_RPlusSImm16) { SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); - int16_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue(); + int16_t Imm = Op1->getAsZExtVal(); if (!Align || isAligned(*Align, Imm)) { Disp = DAG.getTargetConstant(Imm, DL, N.getValueType()); Base = Op0; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 375e63654db1..8a37e40414ee 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -270,12 +270,15 @@ def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>; // Link register def LR : SPR<8, "lr">, DwarfRegNum<[-2, 65]>; -//let Aliases = [LR] in -def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]>; +def LR8 : SPR<8, "lr">, DwarfRegNum<[65, -2]> { + let Aliases = [LR]; +} // Count register def CTR : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>; -def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>; +def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]> { + let Aliases = [CTR]; +} // VRsave register def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 4759aa951664..d616aaeddf41 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -466,10 +466,6 @@ public: bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - bool isGPRF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - - bool isGPRPF64AsFPR() const { return isGPR() && Reg.IsGPRAsFPR; } - static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm, RISCVMCExpr::VariantKind &VK) { if (auto *RE = dyn_cast<RISCVMCExpr>(Expr)) { @@ -2039,9 +2035,8 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); - RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL; - if (Identifier.consume_back("@plt")) - Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; + RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; + (void)Identifier.consume_back("@plt"); MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext()); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 50ed85acdec0..697ad476ff8c 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -579,7 +579,7 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Select the recommended relocation type R_RISCV_CALL_PLT. 
if (!Info.Callee.isReg()) - Info.Callee.setTargetFlags(RISCVII::MO_PLT); + Info.Callee.setTargetFlags(RISCVII::MO_CALL); MachineInstrBuilder Call = MIRBuilder diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 079906d1958c..ab8070772fe5 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -113,7 +113,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder(G_BITREVERSE).maxScalar(0, sXLen).lower(); auto &BSWAPActions = getActionDefinitionsBuilder(G_BSWAP); - if (ST.hasStdExtZbb()) + if (ST.hasStdExtZbb() || ST.hasStdExtZbkb()) BSWAPActions.legalFor({sXLen}).clampScalar(0, sXLen, sXLen); else BSWAPActions.maxScalar(0, sXLen).lower(); @@ -411,8 +411,9 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } -bool RISCVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool RISCVLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; GISelChangeObserver &Observer = Helper.Observer; switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index 48c36976501f..f3ec6be16734 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -21,7 +21,6 @@ class GISelChangeObserver; class MachineIRBuilder; class RISCVSubtarget; -/// This class provides the information for the target register banks. class RISCVLegalizerInfo : public LegalizerInfo { const RISCVSubtarget &STI; const unsigned XLen; @@ -30,7 +29,8 @@ class RISCVLegalizerInfo : public LegalizerInfo { public: RISCVLegalizerInfo(const RISCVSubtarget &ST); - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override; diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index aba2511959af..8d97c5ffd20a 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -186,30 +186,37 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) { } static std::pair<uint8_t, uint8_t> -getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL, - uint8_t SEW) { +getEEWAndEMUL(unsigned Opcode, RISCVII::VLMUL LMUL, uint8_t SEW) { uint8_t EEW; switch (Opcode) { case RISCV::VLM_V: case RISCV::VSM_V: case RISCV::VLE8_V: case RISCV::VSE8_V: + case RISCV::VLSE8_V: + case RISCV::VSSE8_V: EEW = 8; break; case RISCV::VLE16_V: case RISCV::VSE16_V: + case RISCV::VLSE16_V: + case RISCV::VSSE16_V: EEW = 16; break; case RISCV::VLE32_V: case RISCV::VSE32_V: + case RISCV::VLSE32_V: + case RISCV::VSSE32_V: EEW = 32; break; case RISCV::VLE64_V: case RISCV::VSE64_V: + case RISCV::VLSE64_V: + case RISCV::VSSE64_V: EEW = 64; break; default: - llvm_unreachable("Opcode is not a vector unit stride load nor store"); + llvm_unreachable("Could not determine EEW from Opcode"); } auto EMUL = RISCVVType::getSameRatioLMUL(SEW, LMUL, EEW); @@ -218,6 +225,18 @@ getEEWAndEMULForUnitStrideLoadStore(unsigned Opcode, RISCVII::VLMUL LMUL, return 
std::make_pair(EEW, *EMUL); } +bool opcodeHasEEWAndEMULInfo(unsigned short Opcode) { + return Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || + Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || + Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || + Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || + Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V || + Opcode == RISCV::VLSE8_V || Opcode == RISCV::VSSE8_V || + Opcode == RISCV::VLSE16_V || Opcode == RISCV::VSSE16_V || + Opcode == RISCV::VLSE32_V || Opcode == RISCV::VSSE32_V || + Opcode == RISCV::VLSE64_V || Opcode == RISCV::VSSE64_V; +} + unsigned RISCVInstrumentManager::getSchedClassID( const MCInstrInfo &MCII, const MCInst &MCI, const llvm::SmallVector<Instrument *> &IVec) const { @@ -249,13 +268,9 @@ unsigned RISCVInstrumentManager::getSchedClassID( uint8_t SEW = SI ? SI->getSEW() : 0; const RISCVVInversePseudosTable::PseudoInfo *RVV = nullptr; - if (Opcode == RISCV::VLM_V || Opcode == RISCV::VSM_V || - Opcode == RISCV::VLE8_V || Opcode == RISCV::VSE8_V || - Opcode == RISCV::VLE16_V || Opcode == RISCV::VSE16_V || - Opcode == RISCV::VLE32_V || Opcode == RISCV::VSE32_V || - Opcode == RISCV::VLE64_V || Opcode == RISCV::VSE64_V) { + if (opcodeHasEEWAndEMULInfo(Opcode)) { RISCVII::VLMUL VLMUL = static_cast<RISCVII::VLMUL>(LMUL); - auto [EEW, EMUL] = getEEWAndEMULForUnitStrideLoadStore(Opcode, VLMUL, SEW); + auto [EEW, EMUL] = getEEWAndEMUL(Opcode, VLMUL, SEW); RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, EMUL, EEW); } else { // Check if it depends on LMUL and SEW diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 716fb67c5824..7ce08eabdeb6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -329,16 +329,17 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, return true; } -bool RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, - int64_t &Value) const { +std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, + MCAsmLayout &Layout, + int64_t &Value) const { if (LF.isSigned()) - return false; + return std::make_pair(false, false); const MCExpr &Expr = LF.getValue(); if (ULEB128Reloc) { LF.getFixups().push_back( MCFixup::create(0, &Expr, FK_Data_leb128, Expr.getLoc())); } - return Expr.evaluateKnownAbsolute(Value, Layout); + return std::make_pair(Expr.evaluateKnownAbsolute(Value, Layout), false); } // Given a compressed control flow instruction this function returns diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index 2ad6534ac8bc..902b44bba70f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -100,8 +100,8 @@ public: bool &WasRelaxed) const override; bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, MCAsmLayout &Layout, bool &WasRelaxed) const override; - bool relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, - int64_t &Value) const override; + std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, MCAsmLayout &Layout, + int64_t &Value) const override; bool writeNopData(raw_ostream &OS, uint64_t Count, const MCSubtargetInfo *STI) const override; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index c32210fc1419..433e2e6f80bd 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -254,7 +254,6 @@ static inline bool isFirstDefTiedToFirstUse(const MCInstrDesc &Desc) { enum { MO_None = 0, MO_CALL = 1, - MO_PLT = 2, MO_LO = 3, MO_HI = 4, MO_PCREL_LO = 5, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index d67351102bc1..64ddae61b1bc 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -41,8 +41,6 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { if (HasVariant) OS << '%' << getVariantKindName(getKind()) << '('; Expr->print(OS, MAI); - if (Kind == VK_RISCV_CALL_PLT) - OS << "@plt"; if (HasVariant) OS << ')'; } diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index 0fd514fa87cd..f2bd5118fc07 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -747,9 +747,6 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, Kind = RISCVMCExpr::VK_RISCV_None; break; case RISCVII::MO_CALL: - Kind = RISCVMCExpr::VK_RISCV_CALL; - break; - case RISCVII::MO_PLT: Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; break; case RISCVII::MO_LO: diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 24a13f93af88..103a2e2da7b9 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -109,6 +109,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandRV32ZdinxStore(MBB, MBBI); case RISCV::PseudoRV32ZdinxLD: return expandRV32ZdinxLoad(MBB, MBBI); + case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCMOVGPR: case RISCV::PseudoCCADD: case RISCV::PseudoCCSUB: @@ -134,6 +135,9 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCSLLIW: case RISCV::PseudoCCSRLIW: case RISCV::PseudoCCSRAIW: + case RISCV::PseudoCCANDN: + case RISCV::PseudoCCORN: + case RISCV::PseudoCCXNOR: return expandCCOp(MBB, MBBI, NextMBBI); case RISCV::PseudoVSETVLI: case RISCV::PseudoVSETVLIX0: @@ -191,7 +195,8 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, Register DestReg = MI.getOperand(0).getReg(); assert(MI.getOperand(4).getReg() == DestReg); - if (MI.getOpcode() == RISCV::PseudoCCMOVGPR) { + if (MI.getOpcode() == RISCV::PseudoCCMOVGPR || + MI.getOpcode() == RISCV::PseudoCCMOVGPRNoX0) { // Add MV. 
BuildMI(TrueBB, DL, TII->get(RISCV::ADDI), DestReg) .add(MI.getOperand(5)) @@ -225,6 +230,9 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCSLLIW: NewOpc = RISCV::SLLIW; break; case RISCV::PseudoCCSRLIW: NewOpc = RISCV::SRLIW; break; case RISCV::PseudoCCSRAIW: NewOpc = RISCV::SRAIW; break; + case RISCV::PseudoCCANDN: NewOpc = RISCV::ANDN; break; + case RISCV::PseudoCCORN: NewOpc = RISCV::ORN; break; + case RISCV::PseudoCCXNOR: NewOpc = RISCV::XNOR; break; } BuildMI(TrueBB, DL, TII->get(NewOpc), DestReg) .add(MI.getOperand(5)) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 59b202606dad..bb7a3291085d 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1021,6 +1021,12 @@ def TuneShortForwardBranchOpt def HasShortForwardBranchOpt : Predicate<"Subtarget->hasShortForwardBranchOpt()">; def NoShortForwardBranchOpt : Predicate<"!Subtarget->hasShortForwardBranchOpt()">; +def TuneConditionalCompressedMoveFusion + : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion", + "true", "Enable branch+c.mv fusion">; +def HasConditionalMoveFusion : Predicate<"Subtarget->hasConditionalMoveFusion()">; +def NoConditionalMoveFusion : Predicate<"!Subtarget->hasConditionalMoveFusion()">; + def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", "SiFive 7-Series processors", [TuneNoDefaultUnroll, diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp index 5ad1e082344e..1129206800ad 100644 --- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -362,7 +362,7 @@ RISCVGatherScatterLowering::determineBaseAndStride(Instruction *Ptr, VecOperand = i; - TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType()); + TypeSize TS = GTI.getSequentialElementStride(*DL); if (TS.isScalable()) return std::make_pair(nullptr, nullptr); diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index bfa3bf3cc74e..0d8688ba2eae 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -763,14 +763,12 @@ bool RISCVDAGToDAGISel::tryIndexedLoad(SDNode *Node) { return false; EVT LoadVT = Ld->getMemoryVT(); - bool IsPre = (AM == ISD::PRE_INC || AM == ISD::PRE_DEC); - bool IsPost = (AM == ISD::POST_INC || AM == ISD::POST_DEC); + assert((AM == ISD::PRE_INC || AM == ISD::POST_INC) && + "Unexpected addressing mode"); + bool IsPre = AM == ISD::PRE_INC; + bool IsPost = AM == ISD::POST_INC; int64_t Offset = C->getSExtValue(); - // Convert decrements to increments by a negative quantity. - if (AM == ISD::PRE_DEC || AM == ISD::POST_DEC) - Offset = -Offset; - // The constants that can be encoded in the THeadMemIdx instructions // are of the form (sign_extend(imm5) << imm2). 
int64_t Shift; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 03a59f8a8b57..0a1a466af591 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -814,8 +814,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}, VT, Custom); setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom); - setOperationAction( - {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, Legal); + setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT, + ISD::SSUBSAT, ISD::USUBSAT}, + VT, Legal); // Integer VTs are lowered as a series of "RISCVISD::TRUNCATE_VECTOR_VL" // nodes which truncate by one power of two at a time. @@ -1184,9 +1185,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) setOperationAction({ISD::MULHS, ISD::MULHU}, VT, Custom); - setOperationAction( - {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}, VT, - Custom); + setOperationAction({ISD::AVGFLOORU, ISD::SADDSAT, ISD::UADDSAT, + ISD::SSUBSAT, ISD::USUBSAT}, + VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::SELECT_CC, VT, Expand); @@ -1350,8 +1351,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, } if (Subtarget.hasVendorXTHeadMemIdx()) { - for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::POST_DEC; - ++im) { + for (unsigned im : {ISD::PRE_INC, ISD::POST_INC}) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); @@ -1374,8 +1374,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setPrefLoopAlignment(Subtarget.getPrefLoopAlignment()); setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN, - ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL, - ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); + ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND, + ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT}); if (Subtarget.is64Bit()) setTargetDAGCombine(ISD::SRA); @@ -2711,11 +2711,19 @@ InstructionCost RISCVTargetLowering::getVRGatherVICost(MVT VT) const { return getLMULCost(VT); } -/// Return the cost of a vslidedown.vi/vx or vslideup.vi/vx instruction +/// Return the cost of a vslidedown.vx or vslideup.vx instruction +/// for the type VT. (This does not cover the vslide1up or vslide1down +/// variants.) Slides may be linear in the number of vregs implied by LMUL, +/// or may track the vrgather.vv cost. It is implementation-dependent. +InstructionCost RISCVTargetLowering::getVSlideVXCost(MVT VT) const { + return getLMULCost(VT); +} + +/// Return the cost of a vslidedown.vi or vslideup.vi instruction /// for the type VT. (This does not cover the vslide1up or vslide1down /// variants.) Slides may be linear in the number of vregs implied by LMUL, /// or may track the vrgather.vv cost. It is implementation-dependent. 
-InstructionCost RISCVTargetLowering::getVSlideCost(MVT VT) const { +InstructionCost RISCVTargetLowering::getVSlideVICost(MVT VT) const { return getLMULCost(VT); } @@ -2811,8 +2819,8 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, SDValue SplatZero = DAG.getNode( RISCVISD::VMV_V_X_VL, DL, DstContainerVT, DAG.getUNDEF(DstContainerVT), DAG.getConstant(0, DL, Subtarget.getXLenVT()), VL); - Res = DAG.getNode(RISCVISD::VSELECT_VL, DL, DstContainerVT, IsNan, SplatZero, - Res, VL); + Res = DAG.getNode(RISCVISD::VMERGE_VL, DL, DstContainerVT, IsNan, SplatZero, + Res, DAG.getUNDEF(DstContainerVT), VL); if (DstVT.isFixedLengthVector()) Res = convertFromScalableVector(DstVT, Res, DAG, Subtarget); @@ -3489,7 +3497,7 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, for (unsigned I = 0; I < NumElts;) { SDValue V = Op.getOperand(I); - bool BitValue = !V.isUndef() && cast<ConstantSDNode>(V)->getZExtValue(); + bool BitValue = !V.isUndef() && V->getAsZExtVal(); Bits |= ((uint64_t)BitValue << BitPos); ++BitPos; ++I; @@ -3620,8 +3628,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, for (const auto &OpIdx : enumerate(Op->op_values())) { const auto &SeqV = OpIdx.value(); if (!SeqV.isUndef()) - SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask) - << (OpIdx.index() * EltBitSize)); + SplatValue |= + ((SeqV->getAsZExtVal() & EltMask) << (OpIdx.index() * EltBitSize)); } // On RV64, sign-extend from 32 to 64 bits where possible in order to @@ -3650,10 +3658,10 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // would require bit-manipulation instructions to construct the splat value. SmallVector<SDValue> Sequence; const auto *BV = cast<BuildVectorSDNode>(Op); - if (VT.isInteger() && EltBitSize < 64 && + if (VT.isInteger() && EltBitSize < Subtarget.getELen() && ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && BV->getRepeatedSequence(Sequence) && - (Sequence.size() * EltBitSize) <= 64) { + (Sequence.size() * EltBitSize) <= Subtarget.getELen()) { unsigned SeqLen = Sequence.size(); MVT ViaIntVT = MVT::getIntegerVT(EltBitSize * SeqLen); assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32 || @@ -3676,8 +3684,8 @@ static SDValue lowerBuildVectorOfConstants(SDValue Op, SelectionDAG &DAG, // vector type. 
for (const auto &SeqV : Sequence) { if (!SeqV.isUndef()) - SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask) - << (EltIdx * EltBitSize)); + SplatValue |= + ((SeqV->getAsZExtVal() & EltMask) << (EltIdx * EltBitSize)); EltIdx++; } @@ -3938,8 +3946,7 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Passthru, (isa<RegisterSDNode>(VL) && cast<RegisterSDNode>(VL)->getReg() == RISCV::X0)) NewVL = DAG.getRegister(RISCV::X0, MVT::i32); - else if (isa<ConstantSDNode>(VL) && - isUInt<4>(cast<ConstantSDNode>(VL)->getZExtValue())) + else if (isa<ConstantSDNode>(VL) && isUInt<4>(VL->getAsZExtVal())) NewVL = DAG.getNode(ISD::ADD, DL, VL.getValueType(), VL, VL); if (NewVL) { @@ -5401,8 +5408,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, SDValue XIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(), {X, X, DAG.getCondCode(ISD::SETOEQ), DAG.getUNDEF(ContainerVT), Mask, VL}); - NewY = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, XIsNonNan, Y, X, VL); + NewY = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, XIsNonNan, Y, X, + DAG.getUNDEF(ContainerVT), VL); } SDValue NewX = X; @@ -5410,8 +5417,8 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, SDValue YIsNonNan = DAG.getNode(RISCVISD::SETCC_VL, DL, Mask.getValueType(), {Y, Y, DAG.getCondCode(ISD::SETOEQ), DAG.getUNDEF(ContainerVT), Mask, VL}); - NewX = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, YIsNonNan, X, Y, VL); + NewX = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, YIsNonNan, X, Y, + DAG.getUNDEF(ContainerVT), VL); } unsigned Opc = @@ -5458,6 +5465,7 @@ static unsigned getRISCVVLOp(SDValue Op) { OP_CASE(UADDSAT) OP_CASE(SSUBSAT) OP_CASE(USUBSAT) + OP_CASE(AVGFLOORU) OP_CASE(FADD) OP_CASE(FSUB) OP_CASE(FMUL) @@ -5528,7 +5536,6 @@ static unsigned getRISCVVLOp(SDValue Op) { return RISCVISD::VMXOR_VL; return RISCVISD::XOR_VL; case ISD::VP_SELECT: - return RISCVISD::VSELECT_VL; case ISD::VP_MERGE: return RISCVISD::VMERGE_VL; case ISD::VP_ASHR: @@ -6453,6 +6460,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, !Subtarget.hasVInstructionsF16())) return SplitVectorOp(Op, DAG); [[fallthrough]]; + case ISD::AVGFLOORU: case ISD::SADDSAT: case ISD::UADDSAT: case ISD::SSUBSAT: @@ -6914,7 +6922,7 @@ static SDValue combineSelectToBinOp(SDNode *N, SelectionDAG &DAG, MVT VT = N->getSimpleValueType(0); SDLoc DL(N); - if (!Subtarget.hasShortForwardBranchOpt()) { + if (!Subtarget.hasConditionalMoveFusion()) { // (select c, -1, y) -> -c | y if (isAllOnesConstant(TrueV)) { SDValue Neg = DAG.getNegative(CondV, DL, VT); @@ -7078,7 +7086,7 @@ SDValue RISCVTargetLowering::lowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select c, t, f) -> (or (czero_eqz t, c), (czero_nez f, c)) // Unless we have the short forward branch optimization. 
- if (!Subtarget.hasShortForwardBranchOpt()) + if (!Subtarget.hasConditionalMoveFusion()) return DAG.getNode( ISD::OR, DL, VT, DAG.getNode(RISCVISD::CZERO_EQZ, DL, VT, TrueV, CondV), @@ -7456,8 +7464,9 @@ SDValue RISCVTargetLowering::lowerVectorMaskExt(SDValue Op, SelectionDAG &DAG, DAG.getUNDEF(ContainerVT), SplatZero, VL); SplatTrueVal = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SplatTrueVal, VL); - SDValue Select = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, - SplatTrueVal, SplatZero, VL); + SDValue Select = + DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, SplatTrueVal, + SplatZero, DAG.getUNDEF(ContainerVT), VL); return convertFromScalableVector(VecVT, Select, DAG, Subtarget); } @@ -7906,8 +7915,7 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, // Use tail agnostic policy if Idx is the last index of Vec. unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; if (VecVT.isFixedLengthVector() && isa<ConstantSDNode>(Idx) && - cast<ConstantSDNode>(Idx)->getZExtValue() + 1 == - VecVT.getVectorNumElements()) + Idx->getAsZExtVal() + 1 == VecVT.getVectorNumElements()) Policy = RISCVII::TAIL_AGNOSTIC; SDValue Slideup = getVSlideup(DAG, Subtarget, DL, ContainerVT, Vec, ValInVec, Idx, Mask, InsertVL, Policy); @@ -8167,7 +8175,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, const auto [MinVLMAX, MaxVLMAX] = RISCVTargetLowering::computeVLMAXBounds(VT, Subtarget); - uint64_t AVLInt = cast<ConstantSDNode>(AVL)->getZExtValue(); + uint64_t AVLInt = AVL->getAsZExtVal(); if (AVLInt <= MinVLMAX) { I32VL = DAG.getConstant(2 * AVLInt, DL, XLenVT); } else if (AVLInt >= 2 * MaxVLMAX) { @@ -8233,15 +8241,14 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, SDValue Mask = Operands[NumOps - 3]; SDValue MaskedOff = Operands[1]; // Assume Policy operand is the last operand. - uint64_t Policy = - cast<ConstantSDNode>(Operands[NumOps - 1])->getZExtValue(); + uint64_t Policy = Operands[NumOps - 1]->getAsZExtVal(); // We don't need to select maskedoff if it's undef. if (MaskedOff.isUndef()) return Vec; // TAMU if (Policy == RISCVII::TAIL_AGNOSTIC) - return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, Mask, Vec, MaskedOff, - AVL); + return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, + DAG.getUNDEF(VT), AVL); // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma. // It's fine because vmerge does not care mask policy. 
return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, @@ -8489,8 +8496,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getNode(RISCVISD::SETCC_VL, DL, MaskVT, {VID, SplattedIdx, DAG.getCondCode(ISD::SETEQ), DAG.getUNDEF(MaskVT), Mask, VL}); - return DAG.getNode(RISCVISD::VSELECT_VL, DL, VT, SelectCond, SplattedVal, - Vec, VL); + return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, SelectCond, SplattedVal, + Vec, DAG.getUNDEF(VT), VL); } // EGS * EEW >= 128 bits case Intrinsic::riscv_vaesdf_vv: @@ -10243,8 +10250,8 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorSelectToRVV( SDLoc DL(Op); SDValue VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second; - SDValue Select = - DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, CC, Op1, Op2, VL); + SDValue Select = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, CC, Op1, + Op2, DAG.getUNDEF(ContainerVT), VL); return convertFromScalableVector(VT, Select, DAG, Subtarget); } @@ -10327,9 +10334,14 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const { Ops.push_back(DAG.getUNDEF(ContainerVT)); } else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) == OpIdx.index()) { - // For VP_MERGE, copy the false operand instead of an undef value. - assert(Op.getOpcode() == ISD::VP_MERGE); - Ops.push_back(Ops.back()); + if (Op.getOpcode() == ISD::VP_MERGE) { + // For VP_MERGE, copy the false operand instead of an undef value. + Ops.push_back(Ops.back()); + } else { + assert(Op.getOpcode() == ISD::VP_SELECT); + // For VP_SELECT, add an undef value. + Ops.push_back(DAG.getUNDEF(ContainerVT)); + } } } // Pass through operands which aren't fixed-length vectors. @@ -10379,8 +10391,8 @@ SDValue RISCVTargetLowering::lowerVPExtMaskOp(SDValue Op, SDValue Splat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), SplatValue, VL); - SDValue Result = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Src, - Splat, ZeroSplat, VL); + SDValue Result = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Src, Splat, + ZeroSplat, DAG.getUNDEF(ContainerVT), VL); if (!VT.isFixedLengthVector()) return Result; return convertFromScalableVector(VT, Result, DAG, Subtarget); @@ -10508,8 +10520,8 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, RISCVISDExtOpc == RISCVISD::VZEXT_VL ? 1 : -1, DL, XLenVT); SDValue OneSplat = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntVT, DAG.getUNDEF(IntVT), One, VL); - Src = DAG.getNode(RISCVISD::VSELECT_VL, DL, IntVT, Src, OneSplat, - ZeroSplat, VL); + Src = DAG.getNode(RISCVISD::VMERGE_VL, DL, IntVT, Src, OneSplat, + ZeroSplat, DAG.getUNDEF(IntVT), VL); } else if (DstEltSize > (2 * SrcEltSize)) { // Widen before converting. 
MVT IntVT = MVT::getVectorVT(MVT::getIntegerVT(DstEltSize / 2), @@ -10633,8 +10645,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), DAG.getConstant(0, DL, XLenVT), EVL1); - Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1, - SplatZeroOp1, EVL1); + Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op1, SplatOneOp1, + SplatZeroOp1, DAG.getUNDEF(ContainerVT), EVL1); SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), @@ -10642,8 +10654,8 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), DAG.getConstant(0, DL, XLenVT), EVL2); - Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2, - SplatZeroOp2, EVL2); + Op2 = DAG.getNode(RISCVISD::VMERGE_VL, DL, ContainerVT, Op2, SplatOneOp2, + SplatZeroOp2, DAG.getUNDEF(ContainerVT), EVL2); } int64_t ImmValue = cast<ConstantSDNode>(Offset)->getSExtValue(); @@ -10713,8 +10725,8 @@ RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IndicesVT, DAG.getUNDEF(IndicesVT), DAG.getConstant(0, DL, XLenVT), EVL); - Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, IndicesVT, Op1, SplatOne, - SplatZero, EVL); + Op1 = DAG.getNode(RISCVISD::VMERGE_VL, DL, IndicesVT, Op1, SplatOne, + SplatZero, DAG.getUNDEF(IndicesVT), EVL); } unsigned EltSize = GatherVT.getScalarSizeInBits(); @@ -12197,7 +12209,7 @@ static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, if (VT.isVector()) return SDValue(); - if (!Subtarget.hasShortForwardBranchOpt()) { + if (!Subtarget.hasConditionalMoveFusion()) { // (select cond, x, (and x, c)) has custom lowering with Zicond. if ((!Subtarget.hasStdExtZicond() && !Subtarget.hasVendorXVentanaCondOps()) || @@ -12850,9 +12862,9 @@ struct CombineResult; /// Helper class for folding sign/zero extensions. /// In particular, this class is used for the following combines: -/// add | add_vl -> vwadd(u) | vwadd(u)_w -/// sub | sub_vl -> vwsub(u) | vwsub(u)_w -/// mul | mul_vl -> vwmul(u) | vwmul_su +/// add_vl -> vwadd(u) | vwadd(u)_w +/// sub_vl -> vwsub(u) | vwsub(u)_w +/// mul_vl -> vwmul(u) | vwmul_su /// /// An object of this class represents an operand of the operation we want to /// combine. @@ -12897,8 +12909,6 @@ struct NodeExtensionHelper { /// E.g., for zext(a), this would return a. SDValue getSource() const { switch (OrigOperand.getOpcode()) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return OrigOperand.getOperand(0); @@ -12915,8 +12925,7 @@ struct NodeExtensionHelper { /// Get or create a value that can feed \p Root with the given extension \p /// SExt. If \p SExt is std::nullopt, this returns the source of this operand. /// \see ::getSource(). - SDValue getOrCreateExtendedOp(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget, + SDValue getOrCreateExtendedOp(const SDNode *Root, SelectionDAG &DAG, std::optional<bool> SExt) const { if (!SExt.has_value()) return OrigOperand; @@ -12931,10 +12940,8 @@ struct NodeExtensionHelper { // If we need an extension, we should be changing the type. 
SDLoc DL(Root); - auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); + auto [Mask, VL] = getMaskAndVL(Root); switch (OrigOperand.getOpcode()) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: case RISCVISD::VSEXT_VL: case RISCVISD::VZEXT_VL: return DAG.getNode(ExtOpc, DL, NarrowVT, Source, Mask, VL); @@ -12974,15 +12981,12 @@ struct NodeExtensionHelper { /// \pre \p Opcode represents a supported root (\see ::isSupportedRoot()). static unsigned getSameExtensionOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { - case ISD::ADD: case RISCVISD::ADD_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return IsSExt ? RISCVISD::VWADD_VL : RISCVISD::VWADDU_VL; - case ISD::MUL: case RISCVISD::MUL_VL: return IsSExt ? RISCVISD::VWMUL_VL : RISCVISD::VWMULU_VL; - case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -12995,8 +12999,7 @@ struct NodeExtensionHelper { /// Get the opcode to materialize \p Opcode(sext(a), zext(b)) -> /// newOpcode(a, b). static unsigned getSUOpcode(unsigned Opcode) { - assert((Opcode == RISCVISD::MUL_VL || Opcode == ISD::MUL) && - "SU is only supported for MUL"); + assert(Opcode == RISCVISD::MUL_VL && "SU is only supported for MUL"); return RISCVISD::VWMULSU_VL; } @@ -13004,10 +13007,8 @@ struct NodeExtensionHelper { /// newOpcode(a, b). static unsigned getWOpcode(unsigned Opcode, bool IsSExt) { switch (Opcode) { - case ISD::ADD: case RISCVISD::ADD_VL: return IsSExt ? RISCVISD::VWADD_W_VL : RISCVISD::VWADDU_W_VL; - case ISD::SUB: case RISCVISD::SUB_VL: return IsSExt ? RISCVISD::VWSUB_W_VL : RISCVISD::VWSUBU_W_VL; default: @@ -13017,33 +13018,19 @@ struct NodeExtensionHelper { using CombineToTry = std::function<std::optional<CombineResult>( SDNode * /*Root*/, const NodeExtensionHelper & /*LHS*/, - const NodeExtensionHelper & /*RHS*/, SelectionDAG &, - const RISCVSubtarget &)>; + const NodeExtensionHelper & /*RHS*/)>; /// Check if this node needs to be fully folded or extended for all users. bool needToPromoteOtherUsers() const { return EnforceOneUse; } /// Helper method to set the various fields of this struct based on the /// type of \p Root. - void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + void fillUpExtensionSupport(SDNode *Root, SelectionDAG &DAG) { SupportsZExt = false; SupportsSExt = false; EnforceOneUse = true; CheckMask = true; - unsigned Opc = OrigOperand.getOpcode(); - switch (Opc) { - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: { - if (OrigOperand.getValueType().isVector()) { - SupportsZExt = Opc == ISD::ZERO_EXTEND; - SupportsSExt = Opc == ISD::SIGN_EXTEND; - SDLoc DL(Root); - MVT VT = Root->getSimpleValueType(0); - std::tie(Mask, VL) = getDefaultScalableVLOps(VT, DL, DAG, Subtarget); - } - break; - } + switch (OrigOperand.getOpcode()) { case RISCVISD::VZEXT_VL: SupportsZExt = true; Mask = OrigOperand.getOperand(1); @@ -13099,16 +13086,8 @@ struct NodeExtensionHelper { } /// Check if \p Root supports any extension folding combines. 
- static bool isSupportedRoot(const SDNode *Root, const SelectionDAG &DAG) { + static bool isSupportedRoot(const SDNode *Root) { switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (!TLI.isTypeLegal(Root->getValueType(0))) - return false; - return Root->getValueType(0).isScalableVector(); - } case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: @@ -13123,10 +13102,9 @@ struct NodeExtensionHelper { } /// Build a NodeExtensionHelper for \p Root.getOperand(\p OperandIdx). - NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - assert(isSupportedRoot(Root, DAG) && "Trying to build an helper with an " - "unsupported root"); + NodeExtensionHelper(SDNode *Root, unsigned OperandIdx, SelectionDAG &DAG) { + assert(isSupportedRoot(Root) && "Trying to build an helper with an " + "unsupported root"); assert(OperandIdx < 2 && "Requesting something else than LHS or RHS"); OrigOperand = Root->getOperand(OperandIdx); @@ -13142,7 +13120,7 @@ struct NodeExtensionHelper { SupportsZExt = Opc == RISCVISD::VWADDU_W_VL || Opc == RISCVISD::VWSUBU_W_VL; SupportsSExt = !SupportsZExt; - std::tie(Mask, VL) = getMaskAndVL(Root, DAG, Subtarget); + std::tie(Mask, VL) = getMaskAndVL(Root); CheckMask = true; // There's no existing extension here, so we don't have to worry about // making sure it gets removed. @@ -13151,7 +13129,7 @@ struct NodeExtensionHelper { } [[fallthrough]]; default: - fillUpExtensionSupport(Root, DAG, Subtarget); + fillUpExtensionSupport(Root, DAG); break; } } @@ -13167,27 +13145,14 @@ struct NodeExtensionHelper { } /// Helper function to get the Mask and VL from \p Root. - static std::pair<SDValue, SDValue> - getMaskAndVL(const SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - assert(isSupportedRoot(Root, DAG) && "Unexpected root"); - switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: { - SDLoc DL(Root); - MVT VT = Root->getSimpleValueType(0); - return getDefaultScalableVLOps(VT, DL, DAG, Subtarget); - } - default: - return std::make_pair(Root->getOperand(3), Root->getOperand(4)); - } + static std::pair<SDValue, SDValue> getMaskAndVL(const SDNode *Root) { + assert(isSupportedRoot(Root) && "Unexpected root"); + return std::make_pair(Root->getOperand(3), Root->getOperand(4)); } /// Check if the Mask and VL of this operand are compatible with \p Root. - bool areVLAndMaskCompatible(SDNode *Root, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) const { - auto [Mask, VL] = getMaskAndVL(Root, DAG, Subtarget); + bool areVLAndMaskCompatible(const SDNode *Root) const { + auto [Mask, VL] = getMaskAndVL(Root); return isMaskCompatible(Mask) && isVLCompatible(VL); } @@ -13195,14 +13160,11 @@ struct NodeExtensionHelper { /// foldings that are supported by this class. static bool isCommutative(const SDNode *N) { switch (N->getOpcode()) { - case ISD::ADD: - case ISD::MUL: case RISCVISD::ADD_VL: case RISCVISD::MUL_VL: case RISCVISD::VWADD_W_VL: case RISCVISD::VWADDU_W_VL: return true; - case ISD::SUB: case RISCVISD::SUB_VL: case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: @@ -13247,25 +13209,14 @@ struct CombineResult { /// Return a value that uses TargetOpcode and that can be used to replace /// Root. /// The actual replacement is *not* done in that method. 
- SDValue materialize(SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) const { + SDValue materialize(SelectionDAG &DAG) const { SDValue Mask, VL, Merge; - std::tie(Mask, VL) = - NodeExtensionHelper::getMaskAndVL(Root, DAG, Subtarget); - switch (Root->getOpcode()) { - default: - Merge = Root->getOperand(2); - break; - case ISD::ADD: - case ISD::SUB: - case ISD::MUL: - Merge = DAG.getUNDEF(Root->getValueType(0)); - break; - } + std::tie(Mask, VL) = NodeExtensionHelper::getMaskAndVL(Root); + Merge = Root->getOperand(2); return DAG.getNode(TargetOpcode, SDLoc(Root), Root->getValueType(0), - LHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtLHS), - RHS.getOrCreateExtendedOp(Root, DAG, Subtarget, SExtRHS), - Merge, Mask, VL); + LHS.getOrCreateExtendedOp(Root, DAG, SExtLHS), + RHS.getOrCreateExtendedOp(Root, DAG, SExtRHS), Merge, + Mask, VL); } }; @@ -13282,16 +13233,15 @@ struct CombineResult { static std::optional<CombineResult> canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, const NodeExtensionHelper &RHS, bool AllowSExt, - bool AllowZExt, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + bool AllowZExt) { assert((AllowSExt || AllowZExt) && "Forgot to set what you want?"); - if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || - !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) return std::nullopt; if (AllowZExt && LHS.SupportsZExt && RHS.SupportsZExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/false), - Root, LHS, /*SExtLHS=*/false, RHS, /*SExtRHS=*/false); + Root, LHS, /*SExtLHS=*/false, RHS, + /*SExtRHS=*/false); if (AllowSExt && LHS.SupportsSExt && RHS.SupportsSExt) return CombineResult(NodeExtensionHelper::getSameExtensionOpcode( Root->getOpcode(), /*IsSExt=*/true), @@ -13308,10 +13258,9 @@ canFoldToVWWithSameExtensionImpl(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional<CombineResult> canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/true, DAG, Subtarget); + /*AllowZExt=*/true); } /// Check if \p Root follows a pattern Root(LHS, ext(RHS)) @@ -13320,9 +13269,8 @@ canFoldToVWWithSameExtension(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional<CombineResult> canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (!RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + const NodeExtensionHelper &RHS) { + if (!RHS.areVLAndMaskCompatible(Root)) return std::nullopt; // FIXME: Is it useful to form a vwadd.wx or vwsub.wx if it removes a scalar @@ -13346,10 +13294,9 @@ canFoldToVW_W(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. 
static std::optional<CombineResult> canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/true, - /*AllowZExt=*/false, DAG, Subtarget); + /*AllowZExt=*/false); } /// Check if \p Root follows a pattern Root(zext(LHS), zext(RHS)) @@ -13358,10 +13305,9 @@ canFoldToVWWithSEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional<CombineResult> canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { + const NodeExtensionHelper &RHS) { return canFoldToVWWithSameExtensionImpl(Root, LHS, RHS, /*AllowSExt=*/false, - /*AllowZExt=*/true, DAG, Subtarget); + /*AllowZExt=*/true); } /// Check if \p Root follows a pattern Root(sext(LHS), zext(RHS)) @@ -13370,13 +13316,10 @@ canFoldToVWWithZEXT(SDNode *Root, const NodeExtensionHelper &LHS, /// can be used to apply the pattern. static std::optional<CombineResult> canFoldToVW_SU(SDNode *Root, const NodeExtensionHelper &LHS, - const NodeExtensionHelper &RHS, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - + const NodeExtensionHelper &RHS) { if (!LHS.SupportsSExt || !RHS.SupportsZExt) return std::nullopt; - if (!LHS.areVLAndMaskCompatible(Root, DAG, Subtarget) || - !RHS.areVLAndMaskCompatible(Root, DAG, Subtarget)) + if (!LHS.areVLAndMaskCompatible(Root) || !RHS.areVLAndMaskCompatible(Root)) return std::nullopt; return CombineResult(NodeExtensionHelper::getSUOpcode(Root->getOpcode()), Root, LHS, /*SExtLHS=*/true, RHS, /*SExtRHS=*/false); @@ -13386,8 +13329,6 @@ SmallVector<NodeExtensionHelper::CombineToTry> NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { SmallVector<CombineToTry> Strategies; switch (Root->getOpcode()) { - case ISD::ADD: - case ISD::SUB: case RISCVISD::ADD_VL: case RISCVISD::SUB_VL: // add|sub -> vwadd(u)|vwsub(u) @@ -13395,7 +13336,6 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { // add|sub -> vwadd(u)_w|vwsub(u)_w Strategies.push_back(canFoldToVW_W); break; - case ISD::MUL: case RISCVISD::MUL_VL: // mul -> vwmul(u) Strategies.push_back(canFoldToVWWithSameExtension); @@ -13426,14 +13366,12 @@ NodeExtensionHelper::getSupportedFoldings(const SDNode *Root) { /// mul_vl -> vwmul(u) | vwmul_su /// vwadd_w(u) -> vwadd(u) /// vwub_w(u) -> vwadd(u) -static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const RISCVSubtarget &Subtarget) { +static SDValue +combineBinOp_VLToVWBinOp_VL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; - if (!NodeExtensionHelper::isSupportedRoot(N, DAG)) - return SDValue(); - + assert(NodeExtensionHelper::isSupportedRoot(N) && + "Shouldn't have called this method"); SmallVector<SDNode *> Worklist; SmallSet<SDNode *, 8> Inserted; Worklist.push_back(N); @@ -13442,11 +13380,11 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, while (!Worklist.empty()) { SDNode *Root = Worklist.pop_back_val(); - if (!NodeExtensionHelper::isSupportedRoot(Root, DAG)) + if (!NodeExtensionHelper::isSupportedRoot(Root)) return SDValue(); - NodeExtensionHelper LHS(N, 0, DAG, Subtarget); - NodeExtensionHelper RHS(N, 1, DAG, Subtarget); + NodeExtensionHelper LHS(N, 0, DAG); + NodeExtensionHelper RHS(N, 1, DAG); auto AppendUsersIfNeeded = [&Worklist, &Inserted](const 
NodeExtensionHelper &Op) { if (Op.needToPromoteOtherUsers()) { @@ -13473,8 +13411,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, for (NodeExtensionHelper::CombineToTry FoldingStrategy : FoldingStrategies) { - std::optional<CombineResult> Res = - FoldingStrategy(N, LHS, RHS, DAG, Subtarget); + std::optional<CombineResult> Res = FoldingStrategy(N, LHS, RHS); if (Res) { Matched = true; CombinesToApply.push_back(*Res); @@ -13503,7 +13440,7 @@ static SDValue combineBinOp_VLToVWBinOp_VL(SDNode *N, SmallVector<std::pair<SDValue, SDValue>> ValuesToReplace; ValuesToReplace.reserve(CombinesToApply.size()); for (CombineResult Res : CombinesToApply) { - SDValue NewValue = Res.materialize(DAG, Subtarget); + SDValue NewValue = Res.materialize(DAG); if (!InputRootReplacement) { assert(Res.Root == N && "First element is expected to be the current node"); @@ -14503,7 +14440,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = useInversedSetcc(N, DAG, Subtarget)) return V; - if (Subtarget.hasShortForwardBranchOpt()) + if (Subtarget.hasConditionalMoveFusion()) return SDValue(); SDValue TrueVal = N->getOperand(1); @@ -14775,20 +14712,13 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - - assert(N->getOpcode() == RISCVISD::ADD_VL || N->getOpcode() == ISD::ADD); - - if (N->getValueType(0).isFixedLengthVector()) - return SDValue(); - + assert(N->getOpcode() == RISCVISD::ADD_VL); SDValue Addend = N->getOperand(0); SDValue MulOp = N->getOperand(1); + SDValue AddMergeOp = N->getOperand(2); - if (N->getOpcode() == RISCVISD::ADD_VL) { - SDValue AddMergeOp = N->getOperand(2); - if (!AddMergeOp.isUndef()) - return SDValue(); - } + if (!AddMergeOp.isUndef()) + return SDValue(); auto IsVWMulOpc = [](unsigned Opc) { switch (Opc) { @@ -14812,16 +14742,8 @@ static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, if (!MulMergeOp.isUndef()) return SDValue(); - auto [AddMask, AddVL] = [](SDNode *N, SelectionDAG &DAG, - const RISCVSubtarget &Subtarget) { - if (N->getOpcode() == ISD::ADD) { - SDLoc DL(N); - return getDefaultScalableVLOps(N->getSimpleValueType(0), DL, DAG, - Subtarget); - } - return std::make_pair(N->getOperand(3), N->getOperand(4)); - }(N, DAG, Subtarget); - + SDValue AddMask = N->getOperand(3); + SDValue AddVL = N->getOperand(4); SDValue MulMask = MulOp.getOperand(3); SDValue MulVL = MulOp.getOperand(4); @@ -15087,18 +15009,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::AND, DL, VT, NewFMV, DAG.getConstant(~SignBit, DL, VT)); } - case ISD::ADD: { - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; - if (SDValue V = combineToVWMACC(N, DAG, Subtarget)) - return V; + case ISD::ADD: return performADDCombine(N, DAG, Subtarget); - } - case ISD::SUB: { - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; + case ISD::SUB: return performSUBCombine(N, DAG, Subtarget); - } case ISD::AND: return performANDCombine(N, DCI, Subtarget); case ISD::OR: @@ -15106,8 +15020,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return performXORCombine(N, DAG, Subtarget); case ISD::MUL: - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) - return V; return performMULCombine(N, DAG); case ISD::FADD: case ISD::UMAX: @@ -15266,7 +15178,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return 
DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0), {LHS, RHS, CC, TrueV, FalseV}); - if (!Subtarget.hasShortForwardBranchOpt()) { + if (!Subtarget.hasConditionalMoveFusion()) { // (select c, -1, y) -> -c | y if (isAllOnesConstant(TrueV)) { SDValue C = DAG.getSetCC(DL, VT, LHS, RHS, CCVal); @@ -15584,7 +15496,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, break; } case RISCVISD::ADD_VL: - if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) + if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI)) return V; return combineToVWMACC(N, DAG, Subtarget); case RISCVISD::SUB_VL: @@ -15593,7 +15505,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case RISCVISD::VWSUB_W_VL: case RISCVISD::VWSUBU_W_VL: case RISCVISD::MUL_VL: - return combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget); + return combineBinOp_VLToVWBinOp_VL(N, DCI); case RISCVISD::VFMADD_VL: case RISCVISD::VFNMADD_VL: case RISCVISD::VFMSUB_VL: @@ -18303,20 +18215,9 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI, // split it and then direct call can be matched by PseudoCALL. if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = S->getGlobal(); - - unsigned OpFlags = RISCVII::MO_CALL; - if (!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV)) - OpFlags = RISCVII::MO_PLT; - - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, RISCVII::MO_CALL); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - unsigned OpFlags = RISCVII::MO_CALL; - - if (!getTargetMachine().shouldAssumeDSOLocal(*MF.getFunction().getParent(), - nullptr)) - OpFlags = RISCVII::MO_PLT; - - Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, OpFlags); + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), PtrVT, RISCVII::MO_CALL); } // The first call operand is the chain and the second is the target address. @@ -18694,6 +18595,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(UDIV_VL) NODE_NAME_CASE(UREM_VL) NODE_NAME_CASE(XOR_VL) + NODE_NAME_CASE(AVGFLOORU_VL) NODE_NAME_CASE(SADDSAT_VL) NODE_NAME_CASE(UADDSAT_VL) NODE_NAME_CASE(SSUBSAT_VL) @@ -18783,7 +18685,6 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VWMACCSU_VL) NODE_NAME_CASE(VNSRL_VL) NODE_NAME_CASE(SETCC_VL) - NODE_NAME_CASE(VSELECT_VL) NODE_NAME_CASE(VMERGE_VL) NODE_NAME_CASE(VMAND_VL) NODE_NAME_CASE(VMOR_VL) @@ -19357,7 +19258,6 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const { bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, - bool &IsInc, SelectionDAG &DAG) const { // Target does not support indexed loads. if (!Subtarget.hasVendorXTHeadMemIdx()) @@ -19384,7 +19284,6 @@ bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, if (!isLegalIndexedOffset) return false; - IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } @@ -19407,11 +19306,10 @@ bool RISCVTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, } else return false; - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, DAG)) return false; - AM = IsInc ? 
ISD::PRE_INC : ISD::PRE_DEC; + AM = ISD::PRE_INC; return true; } @@ -19431,15 +19329,14 @@ bool RISCVTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, } else return false; - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + if (!getIndexedAddressParts(Op, Base, Offset, AM, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; - AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; + AM = ISD::POST_INC; return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 58ed611efc83..5d51fe168b04 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -253,6 +253,9 @@ enum NodeType : unsigned { SSUBSAT_VL, USUBSAT_VL, + // Averaging adds of unsigned integers. + AVGFLOORU_VL, + MULHS_VL, MULHU_VL, FADD_VL, @@ -330,9 +333,8 @@ enum NodeType : unsigned { // operand is VL. SETCC_VL, - // Vector select with an additional VL operand. This operation is unmasked. - VSELECT_VL, // General vmerge node with mask, true, false, passthru, and vl operands. + // Tail agnostic vselect can be implemented by setting passthru to undef. VMERGE_VL, // Mask binary operators. @@ -526,7 +528,8 @@ public: InstructionCost getVRGatherVVCost(MVT VT) const; InstructionCost getVRGatherVICost(MVT VT) const; - InstructionCost getVSlideCost(MVT VT) const; + InstructionCost getVSlideVXCost(MVT VT) const; + InstructionCost getVSlideVICost(MVT VT) const; // Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -774,8 +777,7 @@ public: bool isVScaleKnownToBeAPowerOfTwo() const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, bool &IsInc, - SelectionDAG &DAG) const; + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const; bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; @@ -903,6 +905,7 @@ private: SDValue lowerFixedLengthVectorSelectToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUnsignedAvgFloor(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 3400b24e0abb..e591aa935c0b 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -1381,6 +1381,11 @@ void RISCVInsertVSETVLI::doPRE(MachineBasicBlock &MBB) { if (!UnavailablePred || !AvailableInfo.isValid()) return; + // If we don't know the exact VTYPE, we can't copy the vsetvli to the exit of + // the unavailable pred. + if (AvailableInfo.hasSEWLMULRatioOnly()) + return; + // Critical edge - TODO: consider splitting? 
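The new bail-out in doPRE above keys off hasSEWLMULRatioOnly(): a lattice state that only pins down the SEW/LMUL ratio is not enough to materialize a vsetvli at the predecessor exit, because vsetvli must encode a concrete SEW and LMUL and several pairs share one ratio. A small standalone illustration of that ambiguity (plain C++, not the pass's data structures):

#include <cstdio>

struct VType { unsigned SEW; double LMUL; };

static double sewLmulRatio(VType V) { return V.SEW / V.LMUL; }

int main() {
  VType A{32, 2.0}, B{16, 1.0}, C{64, 4.0};
  // All three print 16, so the ratio alone cannot pick a single VTYPE.
  std::printf("%.0f %.0f %.0f\n", sewLmulRatio(A), sewLmulRatio(B),
              sewLmulRatio(C));
  return 0;
}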
if (UnavailablePred->succ_size() != 1) return; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index cd98438eed88..351f48c1708e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1346,6 +1346,10 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::SLLIW: return RISCV::PseudoCCSLLIW; break; case RISCV::SRLIW: return RISCV::PseudoCCSRLIW; break; case RISCV::SRAIW: return RISCV::PseudoCCSRAIW; break; + + case RISCV::ANDN: return RISCV::PseudoCCANDN; break; + case RISCV::ORN: return RISCV::PseudoCCORN; break; + case RISCV::XNOR: return RISCV::PseudoCCXNOR; break; } return RISCV::INSTRUCTION_LIST_END; @@ -2365,7 +2369,6 @@ RISCVInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { using namespace RISCVII; static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_CALL, "riscv-call"}, - {MO_PLT, "riscv-plt"}, {MO_LO, "riscv-lo"}, {MO_HI, "riscv-hi"}, {MO_PCREL_LO, "riscv-pcrel-lo"}, @@ -2651,6 +2654,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case RISCV::TH_MULSH: // Operands 2 and 3 are commutable. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); + case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCMOVGPR: // Operands 4 and 5 are commutable. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5); @@ -2807,6 +2811,7 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI, return TargetInstrInfo::commuteInstructionImpl(WorkingMI, false, OpIdx1, OpIdx2); } + case RISCV::PseudoCCMOVGPRNoX0: case RISCV::PseudoCCMOVGPR: { // CCMOV can be commuted by inverting the condition. auto CC = static_cast<RISCVCC::CondCode>(MI.getOperand(3).getImm()); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 35e8edf5d2fa..792e0bbdf581 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -729,22 +729,6 @@ def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", "">, let imm12 = 0b110000000000; } -let Predicates = [HasStdExtZawrs] in { -def WRS_NTO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.nto", "">, - Sched<[]> { - let rs1 = 0; - let rd = 0; - let imm12 = 0b000000001101; -} - -def WRS_STO : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "wrs.sto", "">, - Sched<[]> { - let rs1 = 0; - let rd = 0; - let imm12 = 0b000000011101; -} -} // Predicates = [HasStdExtZawrs] - } // hasSideEffects = 1, mayLoad = 0, mayStore = 0 def CSRRW : CSR_ir<0b001, "csrrw">; @@ -1387,6 +1371,24 @@ def PseudoCCMOVGPR : Pseudo<(outs GPR:$dst), ReadSFBALU, ReadSFBALU]>; } +// This should always expand to a branch+c.mv so the size is 6 or 4 if the +// branch is compressible. +let Predicates = [HasConditionalMoveFusion, NoShortForwardBranchOpt], + Constraints = "$dst = $falsev", isCommutable = 1, Size = 6 in { +// This instruction moves $truev to $dst when the condition is true. It will +// be expanded to control flow in RISCVExpandPseudoInsts. +// We use GPRNoX0 because c.mv cannot encode X0. +def PseudoCCMOVGPRNoX0 : Pseudo<(outs GPRNoX0:$dst), + (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + GPRNoX0:$falsev, GPRNoX0:$truev), + [(set GPRNoX0:$dst, + (riscv_selectcc_frag:$cc (XLenVT GPR:$lhs), + (XLenVT GPR:$rhs), + cond, (XLenVT GPRNoX0:$truev), + (XLenVT GPRNoX0:$falsev)))]>, + Sched<[]>; +} + // Conditional binops, that updates update $dst to (op rs1, rs2) when condition // is true. Returns $falsev otherwise. Selected by optimizeSelect. 
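The PseudoCCMOVGPRNoX0 definition above models a branch-plus-c.mv pair (4 or 6 bytes depending on whether the branch compresses): the destination is tied to the false value, so the later expansion in RISCVExpandPseudoInsts only has to conditionally skip a single move. A scalar C++ stand-in for the expanded behaviour, with illustrative names (this is a sketch of the semantics, not the expansion code):

#include <cstdint>
#include <cstdio>

// Dst is tied to FalseV ("$dst = $falsev"), so the lowered form is roughly:
//   b<inverse-cc> lhs, rhs, skip   ; branch over the move when cond is false
//   c.mv dst, truev                ; single compressible move
// skip:
static int64_t ccmovModel(bool Cond, int64_t TrueV, int64_t FalseV) {
  int64_t Dst = FalseV; // the tied operand already holds the false value
  if (Cond)
    Dst = TrueV;        // the guarded move
  return Dst;
}

int main() {
  std::printf("%lld %lld\n", (long long)ccmovModel(true, 10, 20),
              (long long)ccmovModel(false, 10, 20)); // prints "10 20"
  return 0;
}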
// TODO: Can we use DefaultOperands on the regular binop to accomplish this more @@ -1517,6 +1519,23 @@ def PseudoCCSRAIW : Pseudo<(outs GPR:$dst), GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; + +// Zbb/Zbkb instructions +def PseudoCCANDN : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, + ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +def PseudoCCORN : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, + ReadSFBALU, ReadSFBALU, ReadSFBALU]>; +def PseudoCCXNOR : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + GPR:$falsev, GPR:$rs1, GPR:$rs2), []>, + Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, + ReadSFBALU, ReadSFBALU, ReadSFBALU]>; } multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> { @@ -1535,7 +1554,7 @@ multiclass SelectCC_GPR_rrirr<DAGOperand valty, ValueType vt> { (IntCCtoRISCVCC $cc), valty:$truev, valty:$falsev)>; } -let Predicates = [NoShortForwardBranchOpt] in +let Predicates = [NoConditionalMoveFusion] in defm Select_GPR : SelectCC_GPR_rrirr<GPR, XLenVT>; class SelectCompressOpt<CondCode Cond> @@ -2095,6 +2114,7 @@ include "RISCVInstrInfoM.td" // Atomic include "RISCVInstrInfoA.td" +include "RISCVInstrInfoZa.td" // Scalar FP include "RISCVInstrInfoF.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td index c8301fcc6b93..4d0567e41abc 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td @@ -7,8 +7,7 @@ //===----------------------------------------------------------------------===// // // This file describes the RISC-V instructions from the standard 'A', Atomic -// Instructions extension as well as the experimental 'Zacas' (Atomic -// Compare-and-Swap) extension. +// Instructions extension. 
// //===----------------------------------------------------------------------===// @@ -96,15 +95,6 @@ defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">, Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>; } // Predicates = [HasStdExtA, IsRV64] -let Predicates = [HasStdExtZacas] in { -defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">; -defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">; -} // Predicates = [HasStdExtZacas] - -let Predicates = [HasStdExtZacas, IsRV64] in { -defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">; -} // Predicates = [HasStdExtZacas, IsRV64] - //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 6af710049a9d..418421b2a556 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -36,11 +36,13 @@ def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmINX">; def GPRPF64AsFPR : AsmOperandClass { let Name = "GPRPF64AsFPR"; let ParserMethod = "parseGPRAsFPR"; + let PredicateMethod = "isGPRAsFPR"; let RenderMethod = "addRegOperands"; } def GPRF64AsFPR : AsmOperandClass { let Name = "GPRF64AsFPR"; + let PredicateMethod = "isGPRAsFPR"; let ParserMethod = "parseGPRAsFPR"; let RenderMethod = "addRegOperands"; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 30deeaa06448..fcb18b67623e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6719,12 +6719,14 @@ defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">; // 15.2. Vector mask population count vcpop //===----------------------------------------------------------------------===// +let IsSignExtendingOpW = 1 in defm PseudoVCPOP: VPseudoVPOP_M; //===----------------------------------------------------------------------===// // 15.3. vfirst find-first-set mask bit //===----------------------------------------------------------------------===// +let IsSignExtendingOpW = 1 in defm PseudoVFIRST: VPseudoV1ST_M; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index b7c845703794..4f87c36506e5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1131,6 +1131,22 @@ defm : VPatBinarySDNode_VV_VX_VI<uaddsat, "PseudoVSADDU">; defm : VPatBinarySDNode_VV_VX<ssubsat, "PseudoVSSUB">; defm : VPatBinarySDNode_VV_VX<usubsat, "PseudoVSSUBU">; +// 12.2. Vector Single-Width Averaging Add and Subtract +foreach vti = AllIntegerVectors in { + let Predicates = GetVTypePredicates<vti>.Predicates in { + def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2)), + (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, vti.RegClass:$rs2, + 0b10, vti.AVL, vti.Log2SEW, TA_MA)>; + def : Pat<(avgflooru (vti.Vector vti.RegClass:$rs1), + (vti.Vector (SplatPat (XLenVT GPR:$rs2)))), + (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX) + (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, GPR:$rs2, + 0b10, vti.AVL, vti.Log2SEW, TA_MA)>; + } +} + // 15. Vector Mask Instructions // 15.1. 
Vector Mask-Register Logical Instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 5b50a4a78c01..d60ff4b5fab0 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -111,6 +111,7 @@ def riscv_ctlz_vl : SDNode<"RISCVISD::CTLZ_VL", SDT_RISCVIntUnOp_VL> def riscv_cttz_vl : SDNode<"RISCVISD::CTTZ_VL", SDT_RISCVIntUnOp_VL>; def riscv_ctpop_vl : SDNode<"RISCVISD::CTPOP_VL", SDT_RISCVIntUnOp_VL>; +def riscv_avgflooru_vl : SDNode<"RISCVISD::AVGFLOORU_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_saddsat_vl : SDNode<"RISCVISD::SADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_uaddsat_vl : SDNode<"RISCVISD::UADDSAT_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; def riscv_ssubsat_vl : SDNode<"RISCVISD::SSUBSAT_VL", SDT_RISCVIntBinOp_VL>; @@ -338,13 +339,6 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL", SDTCisSameNumEltsAs<0, 4>, SDTCisVT<5, XLenVT>]>>; -def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, - SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT> -]>; - -def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; - def SDT_RISCVVMERGE_VL : SDTypeProfile<1, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>, @@ -1722,21 +1716,21 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> { (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1861,17 +1855,17 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> { (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : 
Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; @@ -1905,10 +1899,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> { // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -1916,10 +1910,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> { // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; - def : Pat<(riscv_vselect_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, undef, VLOpFrag), (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -2255,31 +2249,6 @@ foreach vtiTowti = AllWidenableIntVectors in { // 11.15. Vector Integer Merge Instructions foreach vti = AllIntegerVectors in { let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - vti.RegClass:$rs1, - vti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, vti.RegClass:$rs1, (vti.Mask V0), - GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (SplatPat XLenVT:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, GPR:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vselect_vl (vti.Mask V0), - (SplatPat_simm5 simm5:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), - vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), vti.RegClass:$rs1, vti.RegClass:$rs2, @@ -2338,6 +2307,24 @@ defm : VPatBinaryVL_VV_VX_VI<riscv_uaddsat_vl, "PseudoVSADDU">; defm : VPatBinaryVL_VV_VX<riscv_ssubsat_vl, "PseudoVSSUB">; defm : VPatBinaryVL_VV_VX<riscv_usubsat_vl, "PseudoVSSUBU">; +// 12.2. 
Vector Single-Width Averaging Add and Subtract +foreach vti = AllIntegerVectors in { + let Predicates = GetVTypePredicates<vti>.Predicates in { + def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector vti.RegClass:$rs2), + vti.RegClass:$merge, (vti.Mask V0), VLOpFrag), + (!cast<Instruction>("PseudoVAADDU_VV_"#vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs1, vti.RegClass:$rs2, + (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + def : Pat<(riscv_avgflooru_vl (vti.Vector vti.RegClass:$rs1), + (vti.Vector (SplatPat (XLenVT GPR:$rs2))), + vti.RegClass:$merge, (vti.Mask V0), VLOpFrag), + (!cast<Instruction>("PseudoVAADDU_VX_"#vti.LMul.MX#"_MASK") + vti.RegClass:$merge, vti.RegClass:$rs1, GPR:$rs2, + (vti.Mask V0), 0b10, GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>; + } +} + // 12.5. Vector Narrowing Fixed-Point Clip Instructions class VPatTruncSatClipMaxMinBase<string inst, VTypeInfo vti, @@ -2534,33 +2521,6 @@ foreach fvti = AllFloatVectors in { // 13.15. Vector Floating-Point Merge Instruction defvar ivti = GetIntVTypeInfo<fvti>.Vti; let Predicates = GetVTypePredicates<ivti>.Predicates in { - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - fvti.RegClass:$rs1, - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), - GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, - GPR:$imm, - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp (fvti.Scalar fpimm0)), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), fvti.RegClass:$rs1, fvti.RegClass:$rs2, @@ -2571,6 +2531,16 @@ foreach fvti = AllFloatVectors in { GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), + fvti.RegClass:$rs2, + fvti.RegClass:$merge, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VXM_"#fvti.LMul.MX) + fvti.RegClass:$merge, fvti.RegClass:$rs2, GPR:$imm, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; + + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), (SplatFPOp (fvti.Scalar fpimm0)), fvti.RegClass:$rs2, fvti.RegClass:$merge, @@ -2581,16 +2551,6 @@ foreach fvti = AllFloatVectors in { } let Predicates = GetVTypePredicates<fvti>.Predicates in { - def : Pat<(fvti.Vector (riscv_vselect_vl (fvti.Mask V0), - (SplatFPOp fvti.ScalarRegClass:$rs1), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) - (fvti.Vector (IMPLICIT_DEF)), - fvti.RegClass:$rs2, - (fvti.Scalar fvti.ScalarRegClass:$rs1), - (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), (SplatFPOp fvti.ScalarRegClass:$rs1), fvti.RegClass:$rs2, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td new file mode 100644 index 000000000000..a09f5715b24f --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZa.td @@ -0,0 +1,44 @@ +//===-- RISCVInstrInfoZa.td - RISC-V Atomic instructions ---*- 
tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard atomic 'Za*' +// extensions: +// - Zawrs (v1.0) : Wait-on-Reservation-Set. +// - Zacas (v1.0-rc1) : Atomic Compare-and-Swap. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Zacas (Atomic Compare-and-Swap) +//===----------------------------------------------------------------------===// + +let Predicates = [HasStdExtZacas] in { +defm AMOCAS_W : AMO_rr_aq_rl<0b00101, 0b010, "amocas.w">; +defm AMOCAS_D : AMO_rr_aq_rl<0b00101, 0b011, "amocas.d">; +} // Predicates = [HasStdExtZacas] + +let Predicates = [HasStdExtZacas, IsRV64] in { +defm AMOCAS_Q : AMO_rr_aq_rl<0b00101, 0b100, "amocas.q">; +} // Predicates = [HasStdExtZacas, IsRV64] + +//===----------------------------------------------------------------------===// +// Zawrs (Wait-on-Reservation-Set) +//===----------------------------------------------------------------------===// + +let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in +class WRSInst<bits<12> funct12, string opcodestr> + : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), opcodestr, ""> { + let rs1 = 0; + let rd = 0; + let imm12 = funct12; +} + +let Predicates = [HasStdExtZawrs] in { +def WRS_NTO : WRSInst<0b000000001101, "wrs.nto">, Sched<[]>; +def WRS_STO : WRSInst<0b000000011101, "wrs.sto">, Sched<[]>; +} // Predicates = [HasStdExtZawrs] diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 2c2b34bb5b77..c16eee67f3c5 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -126,7 +126,11 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, if (MI->getNumExplicitDefs() != 1) return false; - for (auto &UserOp : MRI.use_nodbg_operands(MI->getOperand(0).getReg())) { + Register DestReg = MI->getOperand(0).getReg(); + if (!DestReg.isVirtual()) + return false; + + for (auto &UserOp : MRI.use_nodbg_operands(DestReg)) { const MachineInstr *UserMI = UserOp.getParent(); unsigned OpIdx = UserOp.getOperandNo(); diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index ba8996e710ed..52800f086129 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -232,7 +232,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", NoSchedModel, FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs, - FeatureStdExtZfhmin]>; + FeatureStdExtZfhmin], + [TuneConditionalCompressedMoveFusion]>; def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", SyntacoreSCR1Model, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 840fd149d681..a59d058382fe 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -487,7 +487,7 @@ defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t, defvar VM1VTs = [vint8m1_t, vint16m1_t, vint32m1_t, vint64m1_t, vbfloat16m1_t, vfloat16m1_t, vfloat32m1_t, vfloat64m1_t, vint8mf2_t, vint8mf4_t, vint8mf8_t, - vint16mf2_t, vint16mf4_t, vint32mf2_t, + vint16mf2_t, 
vint16mf4_t, vint32mf2_t, vfloat16mf4_t, vfloat16mf2_t, vbfloat16mf4_t, vbfloat16mf2_t, vfloat32mf2_t]; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 26320b05d9be..2ba93764facd 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -150,6 +150,13 @@ public: bool hasHalfFPLoadStoreMove() const { return HasStdExtZfhmin || HasStdExtZfbfmin; } + + bool hasConditionalMoveFusion() const { + // Do we support fusing a branch+mv or branch+c.mv as a conditional move. + return (hasConditionalCompressedMoveFusion() && hasStdExtCOrZca()) || + hasShortForwardBranchOpt(); + } + bool is64Bit() const { return IsRV64; } MVT getXLenVT() const { return is64Bit() ? MVT::i64 : MVT::i32; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 4614446b2150..b3916c987005 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -34,6 +34,65 @@ static cl::opt<unsigned> SLPMaxVF( "exclusively by SLP vectorizer."), cl::Hidden); +InstructionCost +RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, + TTI::TargetCostKind CostKind) { + size_t NumInstr = OpCodes.size(); + if (CostKind == TTI::TCK_CodeSize) + return NumInstr; + InstructionCost LMULCost = TLI->getLMULCost(VT); + if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency)) + return LMULCost * NumInstr; + InstructionCost Cost = 0; + for (auto Op : OpCodes) { + switch (Op) { + case RISCV::VRGATHER_VI: + Cost += TLI->getVRGatherVICost(VT); + break; + case RISCV::VRGATHER_VV: + Cost += TLI->getVRGatherVVCost(VT); + break; + case RISCV::VSLIDEUP_VI: + case RISCV::VSLIDEDOWN_VI: + Cost += TLI->getVSlideVICost(VT); + break; + case RISCV::VSLIDEUP_VX: + case RISCV::VSLIDEDOWN_VX: + Cost += TLI->getVSlideVXCost(VT); + break; + case RISCV::VREDMAX_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDAND_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDXOR_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFREDUSUM_VS: { + unsigned VL = VT.getVectorMinNumElements(); + if (!VT.isFixedLengthVector()) + VL *= *getVScaleForTuning(); + Cost += Log2_32_Ceil(VL); + break; + } + case RISCV::VFREDOSUM_VS: { + unsigned VL = VT.getVectorMinNumElements(); + if (!VT.isFixedLengthVector()) + VL *= *getVScaleForTuning(); + Cost += VL; + break; + } + case RISCV::VMV_S_X: + // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1 + default: + Cost += LMULCost; + } + } + return Cost; +} + InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) { assert(Ty->isIntegerTy() && @@ -281,7 +340,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // Example sequence: // vnsrl.wi v10, v8, 0 if (equal(DeinterleaveMask, Mask)) - return LT.first * TLI->getLMULCost(LT.second); + return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI, + LT.second, CostKind); } } } @@ -292,7 +352,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, LT.second.getVectorNumElements() <= 256)) { VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, Tp->getContext()); InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); - return IndexCost + TLI->getVRGatherVVCost(LT.second); + return IndexCost + + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, 
CostKind); } [[fallthrough]]; } @@ -310,7 +371,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *MaskTy = VectorType::get(IntegerType::getInt1Ty(C), EC); InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind); InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); - return 2 * IndexCost + 2 * TLI->getVRGatherVVCost(LT.second) + MaskCost; + return 2 * IndexCost + + getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV}, + LT.second, CostKind) + + MaskCost; } [[fallthrough]]; } @@ -365,19 +429,24 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // Example sequence: // vsetivli zero, 4, e8, mf2, tu, ma (ignored) // vslidedown.vi v8, v9, 2 - return LT.first * TLI->getVSlideCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind); case TTI::SK_InsertSubvector: // Example sequence: // vsetivli zero, 4, e8, mf2, tu, ma (ignored) // vslideup.vi v8, v9, 2 - return LT.first * TLI->getVSlideCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind); case TTI::SK_Select: { // Example sequence: // li a0, 90 // vsetivli zero, 8, e8, mf2, ta, ma (ignored) // vmv.s.x v0, a0 // vmerge.vvm v8, v9, v8, v0 - return LT.first * 3 * TLI->getLMULCost(LT.second); + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, + LT.second, CostKind)); } case TTI::SK_Broadcast: { bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) == @@ -389,7 +458,10 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vsetivli zero, 2, e8, mf8, ta, ma (ignored) // vmv.v.x v8, a0 // vmsne.vi v0, v8, 0 - return LT.first * TLI->getLMULCost(LT.second) * 3; + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for andi + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI}, + LT.second, CostKind)); } // Example sequence: // vsetivli zero, 2, e8, mf8, ta, mu (ignored) @@ -400,24 +472,38 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // vmv.v.x v8, a0 // vmsne.vi v0, v8, 0 - return LT.first * TLI->getLMULCost(LT.second) * 6; + return LT.first * + (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi + TLI->getLMULCost( + LT.second) + // FIXME: vmv.x.s is the same as extractelement + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, + RISCV::VMV_V_X, RISCV::VMSNE_VI}, + LT.second, CostKind)); } if (HasScalar) { // Example sequence: // vmv.v.x v8, a0 - return LT.first * TLI->getLMULCost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind); } // Example sequence: // vrgather.vi v9, v8, 0 - return LT.first * TLI->getVRGatherVICost(LT.second); + return LT.first * + getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind); } - case TTI::SK_Splice: + case TTI::SK_Splice: { // vslidedown+vslideup. // TODO: Multiplying by LT.first implies this legalizes into multiple copies // of similar code, but I think we expand through memory. 
- return 2 * LT.first * TLI->getVSlideCost(LT.second); + unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX}; + if (Index >= 0 && Index < 32) + Opcodes[0] = RISCV::VSLIDEDOWN_VI; + else if (Index < 0 && Index > -32) + Opcodes[1] = RISCV::VSLIDEUP_VI; + return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind); + } case TTI::SK_Reverse: { // TODO: Cases to improve here: // * Illegal vector types @@ -437,7 +523,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (LT.second.isFixedLengthVector()) // vrsub.vi has a 5 bit immediate field, otherwise an li suffices LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1; - InstructionCost GatherCost = 2 + TLI->getVRGatherVVCost(LT.second); + // FIXME: replace the constant `2` below with cost of {VID_V,VRSUB_VX} + InstructionCost GatherCost = + 2 + getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind); // Mask operation additionally required extend and truncate InstructionCost ExtendCost = Tp->getElementType()->isIntegerTy(1) ? 3 : 0; return LT.first * (LenCost + GatherCost + ExtendCost); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 4c955744b37d..7e5dbddb5b51 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -48,6 +48,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> { /// actual target hardware. unsigned getEstimatedVLFor(VectorType *Ty); + InstructionCost getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT, + TTI::TargetCostKind CostKind); + /// Return the cost of accessing a constant pool entry of the specified /// type. InstructionCost getConstantPoolLoadCost(Type *Ty, diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 629db8e2eb4d..0a8b5499a1fc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -211,8 +211,8 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx, MDString *MDKernelArgType = getKernelArgAttribute(F, ArgIdx, "kernel_arg_type"); - if (!MDKernelArgType || (MDKernelArgType->getString().ends_with("*") && - MDKernelArgType->getString().ends_with("_t"))) + if (!MDKernelArgType || (!MDKernelArgType->getString().ends_with("*") && + !MDKernelArgType->getString().ends_with("_t"))) return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual); if (MDKernelArgType->getString().ends_with("*")) @@ -438,7 +438,8 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, assert(Arg.Regs.size() == 1 && "Call arg has multiple VRegs"); ArgVRegs.push_back(Arg.Regs[0]); SPIRVType *SPIRVTy = GR->getOrCreateSPIRVType(Arg.Ty, MIRBuilder); - GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF()); + if (!GR->getSPIRVTypeForVReg(Arg.Regs[0])) + GR->assignSPIRVTypeToVReg(SPIRVTy, Arg.Regs[0], MIRBuilder.getMF()); } if (auto Res = SPIRV::lowerBuiltin( DemangledName, SPIRV::InstructionSet::OpenCL_std, MIRBuilder, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 660c574daf38..fb4e9932dd2d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -74,6 +74,7 @@ class SPIRVEmitIntrinsics void processInstrAfterVisit(Instruction *I); void insertAssignPtrTypeIntrs(Instruction *I); void insertAssignTypeIntrs(Instruction *I); + void 
insertPtrCastInstr(Instruction *I); void processGlobalValue(GlobalVariable &GV); public: @@ -255,7 +256,19 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { } Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { - SmallVector<Type *, 2> Types = {I.getType(), I.getOperand(0)->getType()}; + Value *Source = I.getOperand(0); + + // SPIR-V, contrary to LLVM 17+ IR, supports bitcasts between pointers of + // varying element types. In case of IR coming from older versions of LLVM + // such bitcasts do not provide sufficient information, should be just skipped + // here, and handled in insertPtrCastInstr. + if (I.getType()->isPointerTy()) { + I.replaceAllUsesWith(Source); + I.eraseFromParent(); + return nullptr; + } + + SmallVector<Type *, 2> Types = {I.getType(), Source->getType()}; SmallVector<Value *> Args(I.op_begin(), I.op_end()); auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_bitcast, {Types}, {Args}); std::string InstName = I.hasName() ? I.getName().str() : ""; @@ -265,6 +278,111 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { return NewI; } +void SPIRVEmitIntrinsics::insertPtrCastInstr(Instruction *I) { + Value *Pointer; + Type *ExpectedElementType; + unsigned OperandToReplace; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Pointer = SI->getPointerOperand(); + ExpectedElementType = SI->getValueOperand()->getType(); + OperandToReplace = 1; + } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Pointer = LI->getPointerOperand(); + ExpectedElementType = LI->getType(); + OperandToReplace = 0; + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { + Pointer = GEPI->getPointerOperand(); + ExpectedElementType = GEPI->getSourceElementType(); + OperandToReplace = 0; + } else { + return; + } + + // If Pointer is the result of nop BitCastInst (ptr -> ptr), use the source + // pointer instead. The BitCastInst should be later removed when visited. + while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer)) + Pointer = BC->getOperand(0); + + // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type. + GlobalValue *GV = dyn_cast<GlobalValue>(Pointer); + if (GV && GV->getValueType() == ExpectedElementType) + return; + + // Do not emit spv_ptrcast if Pointer is a result of alloca with expected + // type. + AllocaInst *A = dyn_cast<AllocaInst>(Pointer); + if (A && A->getAllocatedType() == ExpectedElementType) + return; + + if (dyn_cast<GetElementPtrInst>(Pointer)) + return; + + setInsertPointSkippingPhis(*IRB, I); + Constant *ExpectedElementTypeConst = + Constant::getNullValue(ExpectedElementType); + ConstantAsMetadata *CM = + ValueAsMetadata::getConstant(ExpectedElementTypeConst); + MDTuple *TyMD = MDNode::get(F->getContext(), CM); + MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD); + unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace(); + bool FirstPtrCastOrAssignPtrType = true; + + // Do not emit new spv_ptrcast if equivalent one already exists or when + // spv_assign_ptr_type already targets this pointer with the same element + // type. + for (auto User : Pointer->users()) { + auto *II = dyn_cast<IntrinsicInst>(User); + if (!II || + (II->getIntrinsicID() != Intrinsic::spv_assign_ptr_type && + II->getIntrinsicID() != Intrinsic::spv_ptrcast) || + II->getOperand(0) != Pointer) + continue; + + // There is some spv_ptrcast/spv_assign_ptr_type already targeting this + // pointer. 
+ FirstPtrCastOrAssignPtrType = false; + if (II->getOperand(1) != VMD || + dyn_cast<ConstantInt>(II->getOperand(2))->getSExtValue() != + AddressSpace) + continue; + + // The spv_ptrcast/spv_assign_ptr_type targeting this pointer is of the same + // element type and address space. + if (II->getIntrinsicID() != Intrinsic::spv_ptrcast) + return; + + // This must be a spv_ptrcast, do not emit new if this one has the same BB + // as I. Otherwise, search for other spv_ptrcast/spv_assign_ptr_type. + if (II->getParent() != I->getParent()) + continue; + + I->setOperand(OperandToReplace, II); + return; + } + + // Do not emit spv_ptrcast if it would cast to the default pointer element + // type (i8) of the same address space. + if (ExpectedElementType->isIntegerTy(8)) + return; + + // If this would be the first spv_ptrcast and there is no spv_assign_ptr_type + // for this pointer before, do not emit spv_ptrcast but emit + // spv_assign_ptr_type instead. + if (FirstPtrCastOrAssignPtrType && isa<Instruction>(Pointer)) { + buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()}, + ExpectedElementTypeConst, Pointer, + {IRB->getInt32(AddressSpace)}); + return; + } else { + SmallVector<Type *, 2> Types = {Pointer->getType(), Pointer->getType()}; + SmallVector<Value *, 2> Args = {Pointer, VMD, IRB->getInt32(AddressSpace)}; + auto *PtrCastI = + IRB->CreateIntrinsic(Intrinsic::spv_ptrcast, {Types}, Args); + I->setOperand(OperandToReplace, PtrCastI); + return; + } +} + Instruction *SPIRVEmitIntrinsics::visitInsertElementInst(InsertElementInst &I) { SmallVector<Type *, 4> Types = {I.getType(), I.getOperand(0)->getType(), I.getOperand(1)->getType(), @@ -522,13 +640,18 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { for (auto &I : Worklist) { insertAssignPtrTypeIntrs(I); insertAssignTypeIntrs(I); + insertPtrCastInstr(I); } for (auto *I : Worklist) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa<StoreInst>(I)) IRB->SetInsertPoint(I->getNextNode()); + // Visitors return either the original/newly created instruction for further + // processing, nullptr otherwise. 
I = visit(*I); + if (!I) + continue; processInstrAfterVisit(I); } return true; diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index faaf7f0e2548..061bc9674237 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -289,8 +289,9 @@ static Register convertPtrToInt(Register Reg, LLT ConvTy, SPIRVType *SpirvType, return ConvReg; } -bool SPIRVLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, - MachineInstr &MI) const { +bool SPIRVLegalizerInfo::legalizeCustom( + LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const { auto Opc = MI.getOpcode(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); if (!isTypeFoldingSupported(Opc)) { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h index 2541ff29edb0..f18b15b7f169 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.h @@ -29,7 +29,8 @@ class SPIRVLegalizerInfo : public LegalizerInfo { SPIRVGlobalRegistry *GR; public: - bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override; + bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, + LostDebugLocObserver &LocObserver) const override; SPIRVLegalizerInfo(const SPIRVSubtarget &ST); }; } // namespace llvm diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 1bfce70fedc0..cbc16fa98661 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -125,12 +125,32 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, SmallVector<MachineInstr *, 10> ToErase; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { - if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) + if (!isSpvIntrinsic(MI, Intrinsic::spv_bitcast) && + !isSpvIntrinsic(MI, Intrinsic::spv_ptrcast)) continue; assert(MI.getOperand(2).isReg()); MIB.setInsertPt(*MI.getParent(), MI); - MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); ToErase.push_back(&MI); + if (isSpvIntrinsic(MI, Intrinsic::spv_bitcast)) { + MIB.buildBitcast(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); + continue; + } + Register Def = MI.getOperand(0).getReg(); + Register Source = MI.getOperand(2).getReg(); + SPIRVType *BaseTy = GR->getOrCreateSPIRVType( + getMDOperandAsType(MI.getOperand(3).getMetadata(), 0), MIB); + SPIRVType *AssignedPtrType = GR->getOrCreateSPIRVPointerType( + BaseTy, MI, *MF.getSubtarget<SPIRVSubtarget>().getInstrInfo(), + addressSpaceToStorageClass(MI.getOperand(4).getImm())); + + // If the bitcast would be redundant, replace all uses with the source + // register. + if (GR->getSPIRVTypeForVReg(Source) == AssignedPtrType) { + MIB.getMRI()->replaceRegWith(Def, Source); + } else { + GR->assignSPIRVTypeToVReg(AssignedPtrType, Def, MF); + MIB.buildBitcast(Def, Source); + } } } for (MachineInstr *MI : ToErase) @@ -587,6 +607,40 @@ static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR, } } +static bool isImplicitFallthrough(MachineBasicBlock &MBB) { + if (MBB.empty()) + return true; + + // Branching SPIR-V intrinsics are not detected by this generic method. + // Thus, we can only trust negative result. + if (!MBB.canFallThrough()) + return false; + + // Otherwise, we must manually check if we have a SPIR-V intrinsic which + // prevent an implicit fallthrough. 
+ for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (isSpvIntrinsic(*It, Intrinsic::spv_switch)) + return false; + } + return true; +} + +static void removeImplicitFallthroughs(MachineFunction &MF, + MachineIRBuilder MIB) { + // It is valid for MachineBasicBlocks to not finish with a branch instruction. + // In such cases, they will simply fallthrough their immediate successor. + for (MachineBasicBlock &MBB : MF) { + if (!isImplicitFallthrough(MBB)) + continue; + + assert(std::distance(MBB.successors().begin(), MBB.successors().end()) == + 1); + MIB.setInsertPt(MBB, MBB.end()); + MIB.buildBr(**MBB.successors().begin()); + } +} + bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { // Initialize the type registry. const SPIRVSubtarget &ST = MF.getSubtarget<SPIRVSubtarget>(); @@ -599,6 +653,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) { generateAssignInstrs(MF, GR, MIB); processSwitches(MF, GR, MIB); processInstrsWithTypeFolding(MF, GR, MIB); + removeImplicitFallthroughs(MF, MIB); return true; } diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 1503f263e42c..62d9090d289f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Pass.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils.h" #include <optional> using namespace llvm; @@ -151,6 +152,19 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) { } void SPIRVPassConfig::addIRPasses() { + if (TM.getSubtargetImpl()->isVulkanEnv()) { + // Once legalized, we need to structurize the CFG to follow the spec. + // This is done through the following 8 steps. + // TODO(#75801): add the remaining steps. + + // 1. Simplify loop for subsequent transformations. After this steps, loops + // have the following properties: + // - loops have a single entry edge (pre-header to loop header). + // - all loop exits are dominated by the loop pre-header. + // - loops have a single back-edge. + addPass(createLoopSimplifyPass()); + } + TargetPassConfig::addIRPasses(); addPass(createSPIRVRegularizerPass()); addPass(createSPIRVPrepareFunctionsPass(TM)); diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 1c0e8d84e2fd..d4f7d8e89af5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -228,8 +228,8 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) { return MI->getOperand(1).getCImm()->getValue().getZExtValue(); } -bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) { - if (auto *GI = dyn_cast<GIntrinsic>(&MI)) +bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID) { + if (const auto *GI = dyn_cast<GIntrinsic>(&MI)) return GI->is(IntrinsicID); return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h index 30fae6c7de47..60742e2f2728 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.h +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h @@ -79,7 +79,7 @@ MachineInstr *getDefInstrMaybeConstant(Register &ConstReg, uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI); // Check if MI is a SPIR-V specific intrinsic call. 
-bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID); +bool isSpvIntrinsic(const MachineInstr &MI, Intrinsic::ID IntrinsicID); // Get type of i-th operand of the metadata node. Type *getMDOperandAsType(const MDNode *N, unsigned I); diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index c7d8591c5bdf..320f91c76057 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1641,7 +1641,7 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) { // If this is a 64-bit constant that is out of the range of LLILF, // LLIHF and LGFI, split it into two 32-bit pieces. if (Node->getValueType(0) == MVT::i64) { - uint64_t Val = cast<ConstantSDNode>(Node)->getZExtValue(); + uint64_t Val = Node->getAsZExtVal(); if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val) && !isInt<32>(Val)) { splitLargeImmediate(ISD::OR, Node, SDValue(), Val - uint32_t(Val), uint32_t(Val)); @@ -1677,10 +1677,8 @@ void SystemZDAGToDAGISel::Select(SDNode *Node) { isInt<16>(cast<ConstantSDNode>(Op0)->getSExtValue())))) { SDValue CCValid = Node->getOperand(2); SDValue CCMask = Node->getOperand(3); - uint64_t ConstCCValid = - cast<ConstantSDNode>(CCValid.getNode())->getZExtValue(); - uint64_t ConstCCMask = - cast<ConstantSDNode>(CCMask.getNode())->getZExtValue(); + uint64_t ConstCCValid = CCValid.getNode()->getAsZExtVal(); + uint64_t ConstCCMask = CCMask.getNode()->getAsZExtVal(); // Invert the condition. CCMask = CurDAG->getTargetConstant(ConstCCValid ^ ConstCCMask, SDLoc(Node), CCMask.getValueType()); diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 045c4c0aac07..2450c6801a66 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2662,10 +2662,8 @@ static void adjustForFNeg(Comparison &C) { // with (sext (trunc X)) into a comparison with (shl X, 32). static void adjustForLTGFR(Comparison &C) { // Check for a comparison between (shl X, 32) and 0. 
- if (C.Op0.getOpcode() == ISD::SHL && - C.Op0.getValueType() == MVT::i64 && - C.Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { + if (C.Op0.getOpcode() == ISD::SHL && C.Op0.getValueType() == MVT::i64 && + C.Op1.getOpcode() == ISD::Constant && C.Op1->getAsZExtVal() == 0) { auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1)); if (C1 && C1->getZExtValue() == 32) { SDValue ShlOp0 = C.Op0.getOperand(0); @@ -2690,7 +2688,7 @@ static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL, C.Op0.getOperand(0).getOpcode() == ISD::LOAD && C.Op1.getOpcode() == ISD::Constant && cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 && - cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { + C.Op1->getAsZExtVal() == 0) { auto *L = cast<LoadSDNode>(C.Op0.getOperand(0)); if (L->getMemoryVT().getStoreSizeInBits().getFixedValue() <= C.Op0.getValueSizeInBits().getFixedValue()) { @@ -3035,12 +3033,12 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1, CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) && isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid)) return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, - cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond); + CmpOp1->getAsZExtVal(), Cond); if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN && CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 && isIntrinsicWithCC(CmpOp0, Opcode, CCValid)) return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, - cast<ConstantSDNode>(CmpOp1)->getZExtValue(), Cond); + CmpOp1->getAsZExtVal(), Cond); } Comparison C(CmpOp0, CmpOp1, Chain); C.CCMask = CCMaskForCondCode(Cond); @@ -3457,12 +3455,11 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op, // Check for absolute and negative-absolute selections, including those // where the comparison value is sign-extended (for LPGFR and LNGFR). // This check supplements the one in DAGCombiner. - if (C.Opcode == SystemZISD::ICMP && - C.CCMask != SystemZ::CCMASK_CMP_EQ && + if (C.Opcode == SystemZISD::ICMP && C.CCMask != SystemZ::CCMASK_CMP_EQ && C.CCMask != SystemZ::CCMASK_CMP_NE && C.Op1.getOpcode() == ISD::Constant && cast<ConstantSDNode>(C.Op1)->getValueSizeInBits(0) <= 64 && - cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) { + C.Op1->getAsZExtVal() == 0) { if (isAbsolute(C.Op0, TrueOp, FalseOp)) return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT); if (isAbsolute(C.Op0, FalseOp, TrueOp)) @@ -3947,8 +3944,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op, // If user has set the no alignment function attribute, ignore // alloca alignments. - uint64_t AlignVal = - (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0); + uint64_t AlignVal = (RealignOpt ? Align->getAsZExtVal() : 0); uint64_t StackAlign = TFI->getStackAlignment(); uint64_t RequiredAlign = std::max(AlignVal, StackAlign); @@ -4013,8 +4009,7 @@ SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_ELF(SDValue Op, // If user has set the no alignment function attribute, ignore // alloca alignments. - uint64_t AlignVal = - (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0); + uint64_t AlignVal = (RealignOpt ? Align->getAsZExtVal() : 0); uint64_t StackAlign = TFI->getStackAlignment(); uint64_t RequiredAlign = std::max(AlignVal, StackAlign); @@ -4213,7 +4208,7 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const { // If the low part is a constant that is outside the range of LHI, // then we're better off using IILF. 
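The change that recurs through these SystemZ hunks (and the WebAssembly and X86 ones further down) is purely mechanical: cast<ConstantSDNode>(N)->getZExtValue() becomes N->getAsZExtVal(). Judging from the call sites alone, the new accessor is presumably a thin wrapper with the semantics sketched below; this is an assumption for illustration, not a quote of the SelectionDAG implementation, and the free-function name is invented.

#include "llvm/CodeGen/SelectionDAGNodes.h"

// Assumed semantics of SDNode::getAsZExtVal(), inferred from how this diff
// uses it. Hypothetical free function, deliberately named differently.
static uint64_t getAsZExtValSketch(const llvm::SDNode *N) {
  // Callers in this patch already know N is a ConstantSDNode (they check
  // isa<ConstantSDNode>(...) or compare the opcode against ISD::Constant).
  return llvm::cast<llvm::ConstantSDNode>(N)->getZExtValue();
}

So, for example, uint64_t Val = Node->getAsZExtVal(); in the SystemZ Select() hunk is a drop-in replacement for the longer cast form, with no behavioural change.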
if (LowOp.getOpcode() == ISD::Constant) { - int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue()); + int64_t Value = int32_t(LowOp->getAsZExtVal()); if (!isInt<16>(Value)) return Op; } @@ -5897,7 +5892,7 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, Op1.getOpcode() != ISD::BITCAST && Op1.getOpcode() != ISD::ConstantFP && Op2.getOpcode() == ISD::Constant) { - uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue(); + uint64_t Index = Op2->getAsZExtVal(); unsigned Mask = VT.getVectorNumElements() - 1; if (Index <= Mask) return Op; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 37abbb072cdd..15dc44a04395 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -278,7 +278,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) { unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); TmpOffset += SL->getElementOffset(Idx); } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const auto *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4bcf89690505..7c47790d1e35 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1869,8 +1869,7 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op, Ops[OpIdx++] = Op.getOperand(2); while (OpIdx < 18) { const SDValue &MaskIdx = Op.getOperand(OpIdx + 1); - if (MaskIdx.isUndef() || - cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) { + if (MaskIdx.isUndef() || MaskIdx.getNode()->getAsZExtVal() >= 32) { bool isTarget = MaskIdx.getNode()->getOpcode() == ISD::TargetConstant; Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32, isTarget); } else { @@ -1912,7 +1911,7 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, const SDNode *Index = Extract.getOperand(1).getNode(); if (!isa<ConstantSDNode>(Index)) return SDValue(); - unsigned IndexVal = cast<ConstantSDNode>(Index)->getZExtValue(); + unsigned IndexVal = Index->getAsZExtVal(); unsigned Scale = ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements(); assert(Scale > 1); @@ -2335,7 +2334,7 @@ WebAssemblyTargetLowering::LowerAccessVectorElement(SDValue Op, SDNode *IdxNode = Op.getOperand(Op.getNumOperands() - 1).getNode(); if (isa<ConstantSDNode>(IdxNode)) { // Ensure the index type is i32 to match the tablegen patterns - uint64_t Idx = cast<ConstantSDNode>(IdxNode)->getZExtValue(); + uint64_t Idx = IdxNode->getAsZExtVal(); SmallVector<SDValue, 3> Ops(Op.getNode()->ops()); Ops[Op.getNumOperands() - 1] = DAG.getConstant(Idx, SDLoc(IdxNode), MVT::i32); diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h index 1f69feceae27..12134f7b00f1 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h @@ -21,7 +21,6 @@ namespace llvm { class X86Subtarget; class X86TargetMachine; -/// This class provides the information for the target register banks. 
class X86LegalizerInfo : public LegalizerInfo { private: /// Keep a reference to the X86Subtarget around so that we can diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index e006dd877360..304b998e1f26 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -148,25 +148,21 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::AND16ri8: case X86::AND16rm: case X86::AND16rr: - case X86::AND16rr_REV: case X86::AND32i32: case X86::AND32ri: case X86::AND32ri8: case X86::AND32rm: case X86::AND32rr: - case X86::AND32rr_REV: case X86::AND64i32: case X86::AND64ri32: case X86::AND64ri8: case X86::AND64rm: case X86::AND64rr: - case X86::AND64rr_REV: case X86::AND8i8: case X86::AND8ri: case X86::AND8ri8: case X86::AND8rm: case X86::AND8rr: - case X86::AND8rr_REV: return FirstMacroFusionInstKind::And; // CMP case X86::CMP16i16: @@ -175,28 +171,24 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::CMP16ri8: case X86::CMP16rm: case X86::CMP16rr: - case X86::CMP16rr_REV: case X86::CMP32i32: case X86::CMP32mr: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP32rm: case X86::CMP32rr: - case X86::CMP32rr_REV: case X86::CMP64i32: case X86::CMP64mr: case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP64rm: case X86::CMP64rr: - case X86::CMP64rr_REV: case X86::CMP8i8: case X86::CMP8mr: case X86::CMP8ri: case X86::CMP8ri8: case X86::CMP8rm: case X86::CMP8rr: - case X86::CMP8rr_REV: return FirstMacroFusionInstKind::Cmp; // ADD case X86::ADD16i16: @@ -204,50 +196,42 @@ classifyFirstOpcodeInMacroFusion(unsigned Opcode) { case X86::ADD16ri8: case X86::ADD16rm: case X86::ADD16rr: - case X86::ADD16rr_REV: case X86::ADD32i32: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32rm: case X86::ADD32rr: - case X86::ADD32rr_REV: case X86::ADD64i32: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64rm: case X86::ADD64rr: - case X86::ADD64rr_REV: case X86::ADD8i8: case X86::ADD8ri: case X86::ADD8ri8: case X86::ADD8rm: case X86::ADD8rr: - case X86::ADD8rr_REV: // SUB case X86::SUB16i16: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: case X86::SUB16rr: - case X86::SUB16rr_REV: case X86::SUB32i32: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB32rm: case X86::SUB32rr: - case X86::SUB32rr_REV: case X86::SUB64i32: case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB64rm: case X86::SUB64rr: - case X86::SUB64rr_REV: case X86::SUB8i8: case X86::SUB8ri: case X86::SUB8ri8: case X86::SUB8rm: case X86::SUB8rr: - case X86::SUB8rr_REV: return FirstMacroFusionInstKind::AddSub; // INC case X86::INC16r: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 924956295e7c..f7c361393fea 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1650,6 +1650,9 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) ++SrcRegNum; + if (IsND) // Skip new data destination + ++CurOp; + emitRegModRMByte(MI.getOperand(SrcRegNum), getX86RegNum(MI.getOperand(CurOp)), CB); CurOp = SrcRegNum + 1; diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 485afbc1dfbc..21623a805f55 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -131,9 +131,9 @@ FunctionPass *createX86FixupBWInsts(); /// to another, when profitable. 
FunctionPass *createX86DomainReassignmentPass(); -/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX -/// encoding when possible in order to reduce code size. -FunctionPass *createX86EvexToVexInsts(); +/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space when +/// possible in order to reduce code size or facilitate HW decoding. +FunctionPass *createX86CompressEVEXPass(); /// This pass creates the thunks for the retpoline feature. FunctionPass *createX86IndirectThunksPass(); @@ -167,7 +167,7 @@ FunctionPass *createX86SpeculativeLoadHardeningPass(); FunctionPass *createX86SpeculativeExecutionSideEffectSuppression(); FunctionPass *createX86ArgumentStackSlotPass(); -void initializeEvexToVexInstPassPass(PassRegistry &); +void initializeCompressEVEXPassPass(PassRegistry &); void initializeFPSPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c425c37b4186..b95baddd9dea 100644 --- a/llvm/lib/Target/X86/X86EvexToVex.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -1,5 +1,4 @@ -//===- X86EvexToVex.cpp ---------------------------------------------------===// -// Compress EVEX instructions to VEX encoding when possible to reduce code size +//===- X86CompressEVEX.cpp ------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,17 +6,30 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// This file defines the pass that goes over all AVX-512 instructions which -/// are encoded using the EVEX prefix and if possible replaces them by their -/// corresponding VEX encoding which is usually shorter by 2 bytes. -/// EVEX instructions may be encoded via the VEX prefix when the AVX-512 -/// instruction has a corresponding AVX/AVX2 opcode, when vector length -/// accessed by instruction is less than 512 bits and when it does not use -// the xmm or the mask registers or xmm/ymm registers with indexes higher -// than 15. -/// The pass applies code reduction on the generated code for AVX-512 instrs. +// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space +// when possible in order to reduce code size or facilitate HW decoding. // +// Possible compression: +// a. AVX512 instruction (EVEX) -> AVX instruction (VEX) +// b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy/VEX) +// c. NDD (EVEX) -> non-NDD (legacy) +// d. NF_ND (EVEX) -> NF (EVEX) +// +// Compression a, b and c can always reduce code size, with some exceptions +// such as promoted 16-bit CRC32 which is as long as the legacy version. +// +// legacy: +// crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6] +// promoted: +// crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6] +// +// From performance perspective, these should be same (same uops and same EXE +// ports). From a FMV perspective, an older legacy encoding is preferred b/c it +// can execute in more places (broader HW install base). So we will still do +// the compression. +// +// Compression d can help hardware decode (HW may skip reading the NDD +// register) although the instruction length remains unchanged. 
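One practical detail behind the header comment above: the compression table the pass consults is kept sorted by the old (EVEX-space) opcode and searched with a lower-bound binary search, as the pass body further down shows. Here is a stripped-down, standalone illustration of that lookup; the opcode numbers are invented and the real table comes from the TableGen-generated X86GenCompressEVEXTables.inc.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Simplified stand-in for the generated table entries; values are made up.
struct CompressEntry {
  uint16_t OldOpc; // EVEX-space opcode
  uint16_t NewOpc; // legacy/VEX/EVEX replacement
  bool operator<(uint16_t Opc) const { return OldOpc < Opc; }
};

int main() {
  // Must be sorted by OldOpc, mirroring the assertion in the real pass.
  std::vector<CompressEntry> Table = {{100, 10}, {205, 42}, {731, 77}};

  uint16_t Opc = 205;
  auto It = std::lower_bound(Table.begin(), Table.end(), Opc);
  if (It != Table.end() && It->OldOpc == Opc)
    std::printf("compress opcode %d -> %d\n", Opc, It->NewOpc);
  else
    std::printf("no compressed form for opcode %d\n", Opc);
  return 0;
}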
//===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" @@ -38,37 +50,34 @@ using namespace llvm; -// Including the generated EVEX2VEX tables. -struct X86EvexToVexCompressTableEntry { - uint16_t EvexOpc; - uint16_t VexOpc; +// Including the generated EVEX compression tables. +struct X86CompressEVEXTableEntry { + uint16_t OldOpc; + uint16_t NewOpc; - bool operator<(const X86EvexToVexCompressTableEntry &RHS) const { - return EvexOpc < RHS.EvexOpc; + bool operator<(const X86CompressEVEXTableEntry &RHS) const { + return OldOpc < RHS.OldOpc; } - friend bool operator<(const X86EvexToVexCompressTableEntry &TE, - unsigned Opc) { - return TE.EvexOpc < Opc; + friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) { + return TE.OldOpc < Opc; } }; -#include "X86GenEVEX2VEXTables.inc" +#include "X86GenCompressEVEXTables.inc" -#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible" -#define EVEX2VEX_NAME "x86-evex-to-vex-compress" +#define COMP_EVEX_DESC "Compressing EVEX instrs when possible" +#define COMP_EVEX_NAME "x86-compress-evex" -#define DEBUG_TYPE EVEX2VEX_NAME +#define DEBUG_TYPE COMP_EVEX_NAME namespace { -class EvexToVexInstPass : public MachineFunctionPass { +class CompressEVEXPass : public MachineFunctionPass { public: static char ID; - EvexToVexInstPass() : MachineFunctionPass(ID) {} - StringRef getPassName() const override { return EVEX2VEX_DESC; } + CompressEVEXPass() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return COMP_EVEX_DESC; } - /// Loop over all of the basic blocks, replacing EVEX instructions - /// by equivalent VEX instructions when possible for reducing code size. bool runOnMachineFunction(MachineFunction &MF) override; // This pass runs after regalloc and doesn't support VReg operands. @@ -80,7 +89,7 @@ public: } // end anonymous namespace -char EvexToVexInstPass::ID = 0; +char CompressEVEXPass::ID = 0; static bool usesExtendedRegister(const MachineInstr &MI) { auto isHiRegIdx = [](unsigned Reg) { @@ -112,8 +121,8 @@ static bool usesExtendedRegister(const MachineInstr &MI) { return false; } -static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) { - switch (EvexOpc) { +static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) { + switch (OldOpc) { default: return true; case X86::VCVTNEPS2BF16Z128rm: @@ -151,15 +160,15 @@ static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) { } // Do any custom cleanup needed to finalize the conversion. -static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { - (void)VexOpc; +static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { + (void)NewOpc; unsigned Opc = MI.getOpcode(); switch (Opc) { case X86::VALIGNDZ128rri: case X86::VALIGNDZ128rmi: case X86::VALIGNQZ128rri: case X86::VALIGNQZ128rmi: { - assert((VexOpc == X86::VPALIGNRrri || VexOpc == X86::VPALIGNRrmi) && + assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) && "Unexpected new opcode!"); unsigned Scale = (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 
8 : 4; @@ -175,8 +184,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { case X86::VSHUFI32X4Z256rri: case X86::VSHUFI64X2Z256rmi: case X86::VSHUFI64X2Z256rri: { - assert((VexOpc == X86::VPERM2F128rr || VexOpc == X86::VPERM2I128rr || - VexOpc == X86::VPERM2F128rm || VexOpc == X86::VPERM2I128rm) && + assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr || + NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) && "Unexpected new opcode!"); MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); int64_t ImmVal = Imm.getImm(); @@ -200,7 +209,7 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { case X86::VRNDSCALESDZm_Int: case X86::VRNDSCALESSZr_Int: case X86::VRNDSCALESSZm_Int: - const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1); + const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1); int64_t ImmVal = Imm.getImm(); // Ensure that only bits 3:0 of the immediate are used. if ((ImmVal & 0xf) != ImmVal) @@ -211,86 +220,77 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) { return true; } -// For EVEX instructions that can be encoded using VEX encoding -// replace them by the VEX encoding in order to reduce size. -static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) { - // VEX format. - // # of bytes: 0,2,3 1 1 0,1 0,1,2,4 0,1 - // [Prefixes] [VEX] OPCODE ModR/M [SIB] [DISP] [IMM] - // - // EVEX format. - // # of bytes: 4 1 1 1 4 / 1 1 - // [Prefixes] EVEX Opcode ModR/M [SIB] [Disp32] / [Disp8*N] [Immediate] - const MCInstrDesc &Desc = MI.getDesc(); +static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { + uint64_t TSFlags = MI.getDesc().TSFlags; // Check for EVEX instructions only. - if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) + if ((TSFlags & X86II::EncodingMask) != X86II::EVEX) return false; - // Check for EVEX instructions with mask or broadcast as in these cases - // the EVEX prefix is needed in order to carry this information - // thus preventing the transformation to VEX encoding. - if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) + // Instructions with mask or 512-bit vector can't be converted to VEX. + if (TSFlags & (X86II::EVEX_K | X86II::EVEX_L2)) return false; - // Check for EVEX instructions with L2 set. These instructions are 512-bits - // and can't be converted to VEX. - if (Desc.TSFlags & X86II::EVEX_L2) + // EVEX_B has several meanings. + // AVX512: + // register form: rounding control or SAE + // memory form: broadcast + // + // APX: + // MAP4: NDD + // + // For AVX512 cases, EVEX prefix is needed in order to carry this information + // thus preventing the transformation to VEX encoding. + if (TSFlags & X86II::EVEX_B) return false; - // Use the VEX.L bit to select the 128 or 256-bit table. - ArrayRef<X86EvexToVexCompressTableEntry> Table = - (Desc.TSFlags & X86II::VEX_L) ? 
ArrayRef(X86EvexToVex256CompressTable) - : ArrayRef(X86EvexToVex128CompressTable); + ArrayRef<X86CompressEVEXTableEntry> Table = ArrayRef(X86CompressEVEXTable); - unsigned EvexOpc = MI.getOpcode(); - const auto *I = llvm::lower_bound(Table, EvexOpc); - if (I == Table.end() || I->EvexOpc != EvexOpc) + unsigned Opc = MI.getOpcode(); + const auto *I = llvm::lower_bound(Table, Opc); + if (I == Table.end() || I->OldOpc != Opc) return false; - if (usesExtendedRegister(MI)) - return false; - if (!checkVEXInstPredicate(EvexOpc, ST)) - return false; - if (!performCustomAdjustments(MI, I->VexOpc)) + if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) || + !performCustomAdjustments(MI, I->NewOpc)) return false; - MI.setDesc(ST.getInstrInfo()->get(I->VexOpc)); - MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX); + const MCInstrDesc &NewDesc = ST.getInstrInfo()->get(I->NewOpc); + MI.setDesc(NewDesc); + uint64_t Encoding = NewDesc.TSFlags & X86II::EncodingMask; + auto AsmComment = + (Encoding == X86II::VEX) ? X86::AC_EVEX_2_VEX : X86::AC_EVEX_2_LEGACY; + MI.setAsmPrinterFlag(AsmComment); return true; } -bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { +bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) { #ifndef NDEBUG // Make sure the tables are sorted. static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(llvm::is_sorted(X86EvexToVex128CompressTable) && - "X86EvexToVex128CompressTable is not sorted!"); - assert(llvm::is_sorted(X86EvexToVex256CompressTable) && - "X86EvexToVex256CompressTable is not sorted!"); + assert(llvm::is_sorted(X86CompressEVEXTable) && + "X86CompressEVEXTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - if (!ST.hasAVX512()) + if (!ST.hasAVX512() && !ST.hasEGPR() && !ST.hasNDD()) return false; bool Changed = false; - /// Go over all basic blocks in function and replace - /// EVEX encoded instrs by VEX encoding when possible. for (MachineBasicBlock &MBB : MF) { // Traverse the basic block. for (MachineInstr &MI : MBB) - Changed |= CompressEvexToVexImpl(MI, ST); + Changed |= CompressEVEXImpl(MI, ST); } return Changed; } -INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) +INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false) -FunctionPass *llvm::createX86EvexToVexInsts() { - return new EvexToVexInstPass(); +FunctionPass *llvm::createX86CompressEVEXPass() { + return new CompressEVEXPass(); } diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index bdd86e48fa54..20dbaf797e32 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -619,40 +619,30 @@ void X86DomainReassignment::initConverters() { std::make_unique<InstrReplacerDstCOPY>(From, To); }; - bool HasEGPR = STI->hasEGPR(); - createReplacerDstCOPY(X86::MOVZX32rm16, - HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); - createReplacerDstCOPY(X86::MOVZX64rm16, - HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); +#define GET_EGPR_IF_ENABLED(OPC) STI->hasEGPR() ? OPC##_EVEX : OPC + createReplacerDstCOPY(X86::MOVZX32rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); + createReplacerDstCOPY(X86::MOVZX64rm16, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); - createReplacerDstCOPY(X86::MOVZX32rr16, - HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk); - createReplacerDstCOPY(X86::MOVZX64rr16, - HasEGPR ? 
X86::KMOVWkk_EVEX : X86::KMOVWkk); + createReplacerDstCOPY(X86::MOVZX32rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk)); + createReplacerDstCOPY(X86::MOVZX64rr16, GET_EGPR_IF_ENABLED(X86::KMOVWkk)); if (STI->hasDQI()) { - createReplacerDstCOPY(X86::MOVZX16rm8, - HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); - createReplacerDstCOPY(X86::MOVZX32rm8, - HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); - createReplacerDstCOPY(X86::MOVZX64rm8, - HasEGPR ? X86::KMOVBkm_EVEX : X86::KMOVBkm); - - createReplacerDstCOPY(X86::MOVZX16rr8, - HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); - createReplacerDstCOPY(X86::MOVZX32rr8, - HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); - createReplacerDstCOPY(X86::MOVZX64rr8, - HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); + createReplacerDstCOPY(X86::MOVZX16rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); + createReplacerDstCOPY(X86::MOVZX32rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); + createReplacerDstCOPY(X86::MOVZX64rm8, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); + + createReplacerDstCOPY(X86::MOVZX16rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); + createReplacerDstCOPY(X86::MOVZX32rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); + createReplacerDstCOPY(X86::MOVZX64rr8, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); } auto createReplacer = [&](unsigned From, unsigned To) { Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To); }; - createReplacer(X86::MOV16rm, HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm); - createReplacer(X86::MOV16mr, HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk); - createReplacer(X86::MOV16rr, HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk); + createReplacer(X86::MOV16rm, GET_EGPR_IF_ENABLED(X86::KMOVWkm)); + createReplacer(X86::MOV16mr, GET_EGPR_IF_ENABLED(X86::KMOVWmk)); + createReplacer(X86::MOV16rr, GET_EGPR_IF_ENABLED(X86::KMOVWkk)); createReplacer(X86::SHR16ri, X86::KSHIFTRWri); createReplacer(X86::SHL16ri, X86::KSHIFTLWri); createReplacer(X86::NOT16r, X86::KNOTWrr); @@ -661,14 +651,14 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::XOR16rr, X86::KXORWrr); if (STI->hasBWI()) { - createReplacer(X86::MOV32rm, HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm); - createReplacer(X86::MOV64rm, HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm); + createReplacer(X86::MOV32rm, GET_EGPR_IF_ENABLED(X86::KMOVDkm)); + createReplacer(X86::MOV64rm, GET_EGPR_IF_ENABLED(X86::KMOVQkm)); - createReplacer(X86::MOV32mr, HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk); - createReplacer(X86::MOV64mr, HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk); + createReplacer(X86::MOV32mr, GET_EGPR_IF_ENABLED(X86::KMOVDmk)); + createReplacer(X86::MOV64mr, GET_EGPR_IF_ENABLED(X86::KMOVQmk)); - createReplacer(X86::MOV32rr, HasEGPR ? X86::KMOVDkk_EVEX : X86::KMOVDkk); - createReplacer(X86::MOV64rr, HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk); + createReplacer(X86::MOV32rr, GET_EGPR_IF_ENABLED(X86::KMOVDkk)); + createReplacer(X86::MOV64rr, GET_EGPR_IF_ENABLED(X86::KMOVQkk)); createReplacer(X86::SHR32ri, X86::KSHIFTRDri); createReplacer(X86::SHR64ri, X86::KSHIFTRQri); @@ -696,8 +686,8 @@ void X86DomainReassignment::initConverters() { // TODO: KTEST is not a replacement for TEST due to flag differences. Need // to prove only Z flag is used. - //createReplacer(X86::TEST32rr, X86::KTESTDrr); - //createReplacer(X86::TEST64rr, X86::KTESTQrr); + // createReplacer(X86::TEST32rr, X86::KTESTDrr); + // createReplacer(X86::TEST64rr, X86::KTESTQrr); } if (STI->hasDQI()) { @@ -706,9 +696,9 @@ void X86DomainReassignment::initConverters() { createReplacer(X86::AND8rr, X86::KANDBrr); - createReplacer(X86::MOV8rm, HasEGPR ? 
X86::KMOVBkm_EVEX : X86::KMOVBkm); - createReplacer(X86::MOV8mr, HasEGPR ? X86::KMOVBmk_EVEX : X86::KMOVBmk); - createReplacer(X86::MOV8rr, HasEGPR ? X86::KMOVBkk_EVEX : X86::KMOVBkk); + createReplacer(X86::MOV8rm, GET_EGPR_IF_ENABLED(X86::KMOVBkm)); + createReplacer(X86::MOV8mr, GET_EGPR_IF_ENABLED(X86::KMOVBmk)); + createReplacer(X86::MOV8rr, GET_EGPR_IF_ENABLED(X86::KMOVBkk)); createReplacer(X86::NOT8r, X86::KNOTBrr); @@ -719,11 +709,12 @@ void X86DomainReassignment::initConverters() { // TODO: KTEST is not a replacement for TEST due to flag differences. Need // to prove only Z flag is used. - //createReplacer(X86::TEST8rr, X86::KTESTBrr); - //createReplacer(X86::TEST16rr, X86::KTESTWrr); + // createReplacer(X86::TEST8rr, X86::KTESTBrr); + // createReplacer(X86::TEST16rr, X86::KTESTWrr); createReplacer(X86::XOR8rr, X86::KXORBrr); } +#undef GET_EGPR_IF_ENABLED } bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 0ba31e173a1a..1ce1e6f6a563 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -916,7 +916,7 @@ redo_gep: // A array/variable index is always of the form i*S where S is the // constant scale size. See if we can push the scale into immediates. - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + uint64_t S = GTI.getSequentialElementStride(DL); for (;;) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { // Constant-offset addressing. @@ -3046,22 +3046,24 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); +#define GET_EGPR_IF_ENABLED(OPC) Subtarget->hasEGPR() ? OPC##_EVEX : OPC case Intrinsic::x86_sse42_crc32_32_8: - Opc = X86::CRC32r32r8; + Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r8); RC = &X86::GR32RegClass; break; case Intrinsic::x86_sse42_crc32_32_16: - Opc = X86::CRC32r32r16; + Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r16); RC = &X86::GR32RegClass; break; case Intrinsic::x86_sse42_crc32_32_32: - Opc = X86::CRC32r32r32; + Opc = GET_EGPR_IF_ENABLED(X86::CRC32r32r32); RC = &X86::GR32RegClass; break; case Intrinsic::x86_sse42_crc32_64_64: - Opc = X86::CRC32r64r64; + Opc = GET_EGPR_IF_ENABLED(X86::CRC32r64r64); RC = &X86::GR64RegClass; break; +#undef GET_EGPR_IF_ENABLED } const Value *LHS = II->getArgOperand(0); diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index b13bf361ab79..aad839b83ee1 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -173,7 +173,6 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) { #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \ - LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \ LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \ case X86::MNEMONIC##8ri: \ diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 73b10cf3067e..53ce720be2da 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2852,7 +2852,7 @@ bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; - AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue(); + AM.Scale = ScaleOp->getAsZExtVal(); // Attempt to match index patterns, as long as we're not 
relying on implicit // sign-extension, which is performed BEFORE scale. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1e4b1361f98a..5a28240ea9e2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7371,7 +7371,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { - int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); + int Idx = ExtIdx->getAsZExtVal(); if (!isa<ShuffleVectorSDNode>(ExtractedFromVec)) return Idx; @@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); - MVT IVT = VT.changeVectorElementTypeToInteger(); + MVT IVT = + VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16); SmallVector<SDValue, 16> NewOps; for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) - NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I))); + NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16, + Op.getOperand(I))); SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps); return DAG.getBitcast(VT, Res); } @@ -8793,7 +8795,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); - unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue(); + unsigned InsertC = InsIndex->getAsZExtVal(); unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); if (InsertC < NumEltsInLow128Bits) return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); @@ -14369,6 +14371,13 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + if (VT == MVT::v8bf16) { + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(VT, + DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); + } + switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -17096,14 +17105,14 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); } - if (VT == MVT::v32f16) { + if (VT == MVT::v32f16 || VT == MVT::v32bf16) { if (!Subtarget.hasBWI()) return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); V1 = DAG.getBitcast(MVT::v32i16, V1); V2 = DAG.getBitcast(MVT::v32i16, V2); - return DAG.getBitcast(MVT::v32f16, + return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); } @@ -17747,7 +17756,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + unsigned IdxVal = Idx->getAsZExtVal(); SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); @@ -21515,9 +21524,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op, RTLIB::Libcall LC = 
RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16); SDValue Res = - makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first; - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, - DAG.getBitcast(MVT::i32, Res)); + makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first; + return DAG.getBitcast(MVT::i16, Res); } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -24061,7 +24069,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // a >= b ? -1 : 0 -> RES = setcc_carry // a >= b ? 0 : -1 -> RES = ~setcc_carry if (Cond.getOpcode() == X86ISD::SUB) { - unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); + unsigned CondCode = CC->getAsZExtVal(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && @@ -25359,8 +25367,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (IntrData->Type == INTR_TYPE_3OP_IMM8 && Src3.getValueType() != MVT::i8) { - Src3 = DAG.getTargetConstant( - cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8); + Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8); } // We specify 2 possible opcodes for intrinsics with rounding modes. @@ -25385,8 +25392,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant); SDValue Src4 = Op.getOperand(4); if (Src4.getValueType() != MVT::i8) { - Src4 = DAG.getTargetConstant( - cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8); + Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8); } return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), @@ -26788,7 +26794,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, {Chain, Op1, Op2, Size}, VT, MMO); Chain = Res.getValue(1); Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); - unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned Imm = Op2->getAsZExtVal(); if (Imm) Res = DAG.getNode(ISD::SHL, DL, VT, Res, DAG.getShiftAmountConstant(Imm, VT, DL)); @@ -40221,6 +40227,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, } return SDValue(); } + case X86ISD::SHUF128: { + // If we're permuting the upper 256-bits subvectors of a concatenation, then + // see if we can peek through and access the subvector directly. + if (VT.is512BitVector()) { + // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the + // upper subvector is used. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + uint64_t Mask = N->getConstantOperandVal(2); + SmallVector<SDValue> LHSOps, RHSOps; + SDValue NewLHS, NewRHS; + if ((Mask & 0x0A) == 0x0A && + collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) { + NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512); + Mask &= ~0x0A; + } + if ((Mask & 0xA0) == 0xA0 && + collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) { + NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512); + Mask &= ~0xA0; + } + if (NewLHS || NewRHS) + return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS, + NewRHS ? NewRHS : RHS, + DAG.getTargetConstant(Mask, DL, MVT::i8)); + } + return SDValue(); + } case X86ISD::VPERM2X128: { // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)). 
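The 0x0A / 0xA0 masks in the new X86ISD::SHUF128 combine above become clearer once the immediate is decoded: the 8-bit mask packs four 2-bit lane selectors, the first two reading 128-bit lanes of the LHS and the last two reading the RHS, and the high bit of each selector picks a lane in the upper 256-bit half of that source. A small standalone decode, purely for illustration (lane numbering follows the comment in the combine):

#include <cstdint>
#include <cstdio>

// Decode a SHUF128-style immediate: four 2-bit indices, each selecting one of
// the four 128-bit lanes of its source; selectors 0-1 read LHS, 2-3 read RHS.
int main() {
  uint8_t Mask = 0xEE; // example immediate: selectors 2, 3, 2, 3
  bool LHSUpperOnly = (Mask & 0x0A) == 0x0A; // msb set in selectors 0 and 1
  bool RHSUpperOnly = (Mask & 0xA0) == 0xA0; // msb set in selectors 2 and 3
  for (int I = 0; I < 4; ++I) {
    unsigned Lane = (Mask >> (2 * I)) & 0x3;
    std::printf("selector %d -> 128-bit lane %u of %s\n", I, Lane,
                I < 2 ? "LHS" : "RHS");
  }
  std::printf("LHS upper half only: %d, RHS upper half only: %d\n",
              LHSUpperOnly, RHSUpperOnly);
  return 0;
}

When both checks hold, the combine can substitute the concatenated upper subvector directly and clear the corresponding high bits, which is exactly what Mask &= ~0x0A and Mask &= ~0xA0 do above.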
SDValue LHS = N->getOperand(0); @@ -41320,6 +41354,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( return TLO.CombineTo(Op, Src); break; } + case X86ISD::VZEXT_LOAD: { + // If upper demanded elements are not demanded then simplify to a + // scalar_to_vector(load()). + MVT SVT = VT.getSimpleVT().getVectorElementType(); + if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) { + SDLoc DL(Op); + auto *Mem = cast<MemSDNode>(Op); + SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(), + Mem->getMemOperand()); + SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt); + return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec)); + } + break; + } case X86ISD::VBROADCAST: { SDValue Src = Op.getOperand(0); MVT SrcVT = Src.getSimpleValueType(); @@ -41795,7 +41843,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); - unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue(); + unsigned ShAmt = Op1->getAsZExtVal(); if (ShAmt >= BitWidth) break; @@ -42580,7 +42628,7 @@ static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { APInt Imm(SrcVT.getVectorNumElements(), 0); for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) { SDValue In = Op.getOperand(Idx); - if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1)) + if (!In.isUndef() && (In->getAsZExtVal() & 0x1)) Imm.setBit(Idx); } EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); @@ -49931,18 +49979,17 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, SDValue Ptr = Ld->getBasePtr(); SDValue Chain = Ld->getChain(); for (SDNode *User : Chain->uses()) { - if (User != N && + auto *UserLd = dyn_cast<MemSDNode>(User); + if (User != N && UserLd && (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || User->getOpcode() == X86ISD::VBROADCAST_LOAD || ISD::isNormalLoad(User)) && - cast<MemSDNode>(User)->getChain() == Chain && - !User->hasAnyUseOfValue(1) && + UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) && User->getValueSizeInBits(0).getFixedValue() > RegVT.getFixedSizeInBits()) { if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && - cast<MemSDNode>(User)->getBasePtr() == Ptr && - cast<MemSDNode>(User)->getMemoryVT().getSizeInBits() == - MemVT.getSizeInBits()) { + UserLd->getBasePtr() == Ptr && + UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) { SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits()); Extract = DAG.getBitcast(RegVT, Extract); @@ -49961,7 +50008,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, // See if we are loading a constant that matches in the lower // bits of a longer constant (but from a different constant pool ptr). 
EVT UserVT = User->getValueType(0); - SDValue UserPtr = cast<MemSDNode>(User)->getBasePtr(); + SDValue UserPtr = UserLd->getBasePtr(); const Constant *LdC = getTargetConstantFromBasePtr(Ptr); const Constant *UserC = getTargetConstantFromBasePtr(UserPtr); if (LdC && UserC && UserPtr != Ptr) { @@ -53258,7 +53305,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, if (Index.getOpcode() == ISD::ADD && Index.getValueType().getVectorElementType() == PtrVT && isa<ConstantSDNode>(Scale)) { - uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue(); + uint64_t ScaleAmt = Scale->getAsZExtVal(); if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) { BitVector UndefElts; if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) { @@ -54572,6 +54619,14 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT()) return Op0.getOperand(0); } + + // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x)) + if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() && + !X86::mayFoldLoad(Op0.getOperand(0), Subtarget)) + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, + Op0.getOperand(0), Op0.getOperand(0)), + Op0.getOperand(1)); } // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. @@ -54979,6 +55034,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); } break; + case X86ISD::BLENDI: + if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) { + uint64_t Mask0 = Ops[0].getConstantOperandVal(2); + uint64_t Mask1 = Ops[1].getConstantOperandVal(2); + uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0; + MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements()); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Sel = + DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT)); + return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1), + ConcatSubOperand(VT, Ops, 0)); + } + break; case ISD::VSELECT: if (!IsSplat && Subtarget.hasAVX512() && (VT.is256BitVector() || @@ -57602,7 +57670,7 @@ X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const { } Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { - if (ML->isInnermost() && + if (ML && ML->isInnermost() && ExperimentalPrefInnermostLoopAlignment.getNumOccurrences()) return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); return TargetLowering::getPrefLoopAlignment(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 9bd1622cb0d3..32745400a38b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1714,16 +1714,6 @@ namespace llvm { MachineBasicBlock *Entry, const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; - bool splitValueIntoRegisterParts( - SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) - const override; - - SDValue joinRegisterPartsIntoValue( - SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts, - unsigned NumParts, MVT PartVT, EVT ValueVT, - std::optional<CallingConv::ID> CC) const override; - bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp 
b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index b8b5421b9005..d75bd4171fde 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, return getRegisterTypeForCallingConv(Context, CC, VT.changeVectorElementType(MVT::f16)); + if (VT == MVT::bf16) + return MVT::f16; + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const { return TargetLowering::getJumpTableEncoding(); } -bool X86TargetLowering::splitValueIntoRegisterParts( - SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); - Parts[0] = Val; - return true; - } - return false; -} - -SDValue X86TargetLowering::joinRegisterPartsIntoValue( - SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - SDValue Val = Parts[0]; - - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - return Val; - } - return SDValue(); -} - bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index c3a673f97d34..fe7d90fbcdf7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -448,7 +448,7 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 4, EltVT64, VR256X>, null_frag, vinsert128_insert, sched>, - VEX_W1X, EVEX_V256; + EVEX_V256, REX_W; // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { @@ -750,7 +750,7 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128, X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, null_frag, vextract128_extract, SchedRR, SchedMR>, - VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; // Even with DQI we'd like to only use these instructions for masking. 
let Predicates = [HasDQI] in { @@ -1161,7 +1161,7 @@ multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss", avx512vl_f32_info>; defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd", - avx512vl_f64_info>, VEX_W1X; + avx512vl_f64_info>, REX_W; multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR, X86VectorVTInfo _, SDPatternOperator OpNode, @@ -1267,7 +1267,7 @@ defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", avx512vl_i32_info, HasAVX512, 1>; defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", - avx512vl_i64_info, HasAVX512, 1>, VEX_W1X; + avx512vl_i64_info, HasAVX512, 1>, REX_W; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -1460,11 +1460,11 @@ let Predicates = [HasBF16, HasVLX] in let Predicates = [HasVLX, HasDQI] in { defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", - X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, VEX_W1X, - EVEX_V256, EVEX_CD8<64, CD8VT2>; + X86SubVBroadcastld128, v4i64x_info, v2i64x_info>, + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", - X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, VEX_W1X, - EVEX_V256, EVEX_CD8<64, CD8VT2>; + X86SubVBroadcastld128, v4f64x_info, v2f64x_info>, + EVEX_V256, EVEX_CD8<64, CD8VT2>, REX_W; // Patterns for selects of bitcasted operations. def : Pat<(vselect_mask VK4WM:$mask, @@ -3185,15 +3185,13 @@ defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>; multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, - X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, - bit NoRMPattern = 0, + X86SchedWriteMoveLS Sched, bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { let isMoveReg = 1 in def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], - _.ExeDomain>, EVEX, Sched<[Sched.RR]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; + _.ExeDomain>, EVEX, Sched<[Sched.RR]>; def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", @@ -3209,8 +3207,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, !if(NoRMPattern, [], [(set _.RC:$dst, (_.VT (ld_frag addr:$src)))]), - _.ExeDomain>, EVEX, Sched<[Sched.RM]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; + _.ExeDomain>, EVEX, Sched<[Sched.RM]>; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), @@ -3253,53 +3250,48 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoRMPattern = 0> { + bit NoRMPattern = 0> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.AlignedLdFrag, masked_load_aligned, - Sched.ZMM, "", NoRMPattern>, EVEX_V512; + Sched.ZMM, NoRMPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.AlignedLdFrag, masked_load_aligned, - Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256; + Sched.YMM, 
NoRMPattern>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.AlignedLdFrag, masked_load_aligned, - Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128; + Sched.XMM, NoRMPattern>, EVEX_V128; } } multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoRMPattern = 0, + bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag, - masked_load, Sched.ZMM, "", - NoRMPattern, SelectOprr>, EVEX_V512; + masked_load, Sched.ZMM, NoRMPattern, SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag, - masked_load, Sched.YMM, EVEX2VEXOvrd#"Y", - NoRMPattern, SelectOprr>, EVEX_V256; + masked_load, Sched.YMM, NoRMPattern, SelectOprr>, EVEX_V256; defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag, - masked_load, Sched.XMM, EVEX2VEXOvrd, - NoRMPattern, SelectOprr>, EVEX_V128; + masked_load, Sched.XMM, NoRMPattern, SelectOprr>, EVEX_V128; } } multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, - X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, - bit NoMRPattern = 0> { + X86SchedWriteMoveLS Sched, bit NoMRPattern = 0> { let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { let isMoveReg = 1 in def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr # "\t{$src, $dst|$dst, $src}", [], _.ExeDomain>, EVEX, - Sched<[Sched.RR]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">; + Sched<[Sched.RR]>; def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, ${dst} {${mask}}|"# @@ -3319,8 +3311,7 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), !if(NoMRPattern, [], [(st_frag (_.VT _.RC:$src), addr:$dst)]), - _.ExeDomain>, EVEX, Sched<[Sched.MR]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"mr">; + _.ExeDomain>, EVEX, Sched<[Sched.MR]>; def mrk : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", @@ -3344,102 +3335,92 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoMRPattern = 0> { + bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store, - masked_store, Sched.ZMM, "", - NoMRPattern>, EVEX_V512; + masked_store, Sched.ZMM, NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store, - masked_store, Sched.YMM, - EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; + masked_store, Sched.YMM, NoMRPattern>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store, - masked_store, Sched.XMM, EVEX2VEXOvrd, - NoMRPattern>, EVEX_V128; + masked_store, Sched.XMM, NoMRPattern>, EVEX_V128; } } multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, X86SchedWriteMoveLSWidths Sched, - string EVEX2VEXOvrd, bit NoMRPattern = 0> { + bit NoMRPattern = 0> { let Predicates = [prd] in defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, 
alignedstore, - masked_store_aligned, Sched.ZMM, "", - NoMRPattern>, EVEX_V512; + masked_store_aligned, Sched.ZMM, NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore, - masked_store_aligned, Sched.YMM, - EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; + masked_store_aligned, Sched.YMM, NoMRPattern>, EVEX_V256; defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore, - masked_store_aligned, Sched.XMM, EVEX2VEXOvrd, - NoMRPattern>, EVEX_V128; + masked_store_aligned, Sched.XMM, NoMRPattern>, EVEX_V128; } } defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + HasAVX512, SchedWriteFMoveLS>, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + HasAVX512, SchedWriteFMoveLS>, TB, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + HasAVX512, SchedWriteFMoveLS>, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, - HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + HasAVX512, SchedWriteFMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, + SchedWriteFMoveLS, 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPS">, + SchedWriteFMoveLS>, TB, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, + SchedWriteFMoveLS, 0, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, - SchedWriteFMoveLS, "VMOVUPD">, + SchedWriteFMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA", 1>, + HasAVX512, SchedWriteVecMoveLS, 1>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA", 1>, + HasAVX512, SchedWriteVecMoveLS, 1>, TB, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA">, + HasAVX512, SchedWriteVecMoveLS>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, SchedWriteVecMoveLS, - "VMOVDQA">, + HasAVX512, SchedWriteVecMoveLS>, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XD, REX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>, + SchedWriteVecMoveLS, 1, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU", 1>, + SchedWriteVecMoveLS, 1>, TB, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - 
SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>, + SchedWriteVecMoveLS, 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - SchedWriteVecMoveLS, "VMOVDQU">, + SchedWriteVecMoveLS>, TB, XS, REX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need @@ -4844,8 +4825,7 @@ defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, SchedWriteVecIMul, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, - SchedWriteVecIMul, HasDQI, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecIMul, HasDQI, 1>, T8; defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, @@ -4989,8 +4969,7 @@ defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SchedWriteVecALU, HasBWI, 1>; @@ -4999,8 +4978,7 @@ defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SchedWriteVecALU, HasBWI, 1>, T8; @@ -5009,8 +4987,7 @@ defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SchedWriteVecALU, HasBWI, 1>; @@ -5019,8 +4996,7 @@ defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, - SchedWriteVecALU, HasAVX512, 1>, T8, - NotEVEX2VEXConvertible; + SchedWriteVecALU, HasAVX512, 1>, T8; // PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
let Predicates = [HasDQI, NoVLX] in { @@ -5405,8 +5381,7 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, - X86FoldableSchedWrite sched, bit IsCommutable, - string EVEX2VexOvrd> { + X86FoldableSchedWrite sched, bit IsCommutable> { let ExeDomain = _.ExeDomain in { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, @@ -5427,8 +5402,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]>, - EVEX2VEXOverride<EVEX2VexOvrd#"rr"> { + Sched<[sched]> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5436,8 +5410,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride<EVEX2VexOvrd#"rm">; + Sched<[sched.Folded, sched.ReadAfterFold]>; } let Uses = [MXCSR] in @@ -5474,19 +5447,15 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode VecNode, SDNode SaeNode, X86SchedWriteSizes sched, bit IsCommutable> { defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, - VecNode, SaeNode, sched.PS.Scl, IsCommutable, - NAME#"SS">, + VecNode, SaeNode, sched.PS.Scl, IsCommutable>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, - VecNode, SaeNode, sched.PD.Scl, IsCommutable, - NAME#"SD">, + VecNode, SaeNode, sched.PD.Scl, IsCommutable>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in { defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode, - VecNode, SaeNode, sched.PH.Scl, IsCommutable, - NAME#"SH">, - T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, - NotEVEX2VEXConvertible; + VecNode, SaeNode, sched.PH.Scl, IsCommutable>, + T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } } defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds, @@ -5506,14 +5475,13 @@ defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxSAEs, // X86fminc and X86fmaxc instead of X86fmin and X86fmax multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, - X86FoldableSchedWrite sched, - string EVEX2VEXOvrd> { + X86FoldableSchedWrite sched> { let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, - Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr"> { + Sched<[sched]> { let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), @@ -5521,36 +5489,34 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; + Sched<[sched.Folded, sched.ReadAfterFold]>; } } defm VMINCSSZ : 
avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSS">, TB, XS, + SchedWriteFCmp.Scl>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSD">, TB, XD, + SchedWriteFCmp.Scl>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS, + SchedWriteFCmp.Scl>, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD, + SchedWriteFCmp.Scl>, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS, - EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, - NotEVEX2VEXConvertible; + SchedWriteFCmp.Scl>, T_MAP5, XS, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC; + defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS, - EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, - NotEVEX2VEXConvertible; + SchedWriteFCmp.Scl>, T_MAP5, XS, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, @@ -5820,8 +5786,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", - SchedWriteFAdd>, NotEVEX2VEXConvertible; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", SchedWriteFAdd>; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -5985,11 +5950,9 @@ multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched, - bit NotEVEX2VEXConvertibleQ = 0> { + X86SchedWriteWidths sched> { defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32, avx512vl_i32_info, HasAVX512>; - let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64, avx512vl_i64_info, HasAVX512>, REX_W; defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16, @@ -6034,11 +5997,9 @@ multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, - X86SchedWriteWidths sched, - bit NotEVEX2VEXConvertibleQ = 0> { + X86SchedWriteWidths sched> { defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; - let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, REX_W; } @@ -6054,7 +6015,7 @@ defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, - SchedWriteVecShiftImm, 1>, + SchedWriteVecShiftImm>, 
avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; @@ -6066,7 +6027,7 @@ defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SchedWriteVecShift>; defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, - SchedWriteVecShift, 1>; + SchedWriteVecShift>; defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SchedWriteVecShift>; @@ -6435,7 +6396,7 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, avx512vl_i32_info>; let ExeDomain = SSEPackedDouble in defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, - avx512vl_i64_info>, VEX_W1X; + avx512vl_i64_info>, REX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW @@ -8443,9 +8404,9 @@ multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo } let Predicates = [HasDQI, HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode, - MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible; + MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode, - MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible; + MaskOpNode, sched.YMM>, EVEX_V256; } } @@ -8524,11 +8485,10 @@ multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperato defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info128, null_frag, null_frag, sched.XMM, _src.info128.BroadcastStr, "{x}", i128mem, _src.info128.KRCWM>, - EVEX_V128, NotEVEX2VEXConvertible; + EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, _dst.info128, _src.info256, OpNode, MaskOpNode, sched.YMM, _src.info256.BroadcastStr, - "{y}">, EVEX_V256, - NotEVEX2VEXConvertible; + "{y}">, EVEX_V256; // Special patterns to allow use of X86VM[SU]intToFP for masking. Instruction // patterns have been disabled with null_frag. 
@@ -10882,8 +10842,7 @@ defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info, multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, - X86VectorVTInfo CastInfo, - string EVEX2VEXOvrd> { + X86VectorVTInfo CastInfo> { let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), @@ -10891,7 +10850,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (_.VT (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))))>, - Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; + Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10900,8 +10859,7 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, (CastInfo.VT (X86Shuf128 _.RC:$src1, (CastInfo.LdFrag addr:$src2), (i8 timm:$src3)))))>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; + Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", @@ -10918,45 +10876,40 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _, - AVX512VLVectorVTInfo CastInfo, bits<8> opc, - string EVEX2VEXOvrd>{ + AVX512VLVectorVTInfo CastInfo, bits<8> opc>{ let Predicates = [HasAVX512] in defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, - _.info512, CastInfo.info512, "">, EVEX_V512; + _.info512, CastInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, - _.info256, CastInfo.info256, - EVEX2VEXOvrd>, EVEX_V256; + _.info256, CastInfo.info256>, EVEX_V256; } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256, - avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; + avx512vl_f32_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256, - avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_f64_info, avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, - avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; + avx512vl_i32_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, - avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_i64_info, avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; multiclass avx512_valign<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the - // instantiation of this class. 
let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 timm:$src3)))>, - Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; + Sched<[sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86VAlign _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)), (i8 timm:$src3)))>, - Sched<[sched.Folded, sched.ReadAfterFold]>, - EVEX2VEXOverride<"VPALIGNRrmi">; + Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), @@ -10979,7 +10932,6 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched, defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>, AVX512AIi8Base, EVEX, VVVV, EVEX_V128; // We can't really override the 256-bit version so change it back to unset. - let EVEX2VEXOverride = ? in defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>, AVX512AIi8Base, EVEX, VVVV, EVEX_V256; } @@ -11111,7 +11063,7 @@ let Predicates = [HasVLX, HasBWI] in { defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw", SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>, - EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible; + EVEX_CD8<8, CD8VF>; multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { @@ -13088,12 +13040,10 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo let Predicates = [HasFP16, HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v2i64x_info, null_frag, null_frag, sched.XMM, "{1to2}", "{x}", - i128mem, VK2WM>, - EVEX_V128, NotEVEX2VEXConvertible; + i128mem, VK2WM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f16x_info, v4i64x_info, null_frag, null_frag, sched.YMM, "{1to4}", "{y}", - i256mem, VK4WM>, - EVEX_V256, NotEVEX2VEXConvertible; + i256mem, VK4WM>, EVEX_V256; } def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}", diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 6b0c1b8c28c9..5cfa95e085e3 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -71,24 +71,60 @@ multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOpera // FIXME: Used for 8-bit mul, ignore result upper 8 bits. // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. 
- let Defs = [AL,EFLAGS,AX], Uses = [AL] in - def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, - [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>; - let Defs = [AX,DX,EFLAGS], Uses = [AX] in - def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; - let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in - def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; - let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in - def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; - let Defs = [AL,EFLAGS,AX], Uses = [AL] in - def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, - [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>; - let Defs = [AX,DX,EFLAGS], Uses = [AX] in - def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; - let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in - def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; - let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in - def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; + let Defs = [AL, EFLAGS, AX], Uses = [AL] in + def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, + [(set AL, (node AL, GR8:$src1)), (implicit EFLAGS)]>; + let Defs = [AX, DX, EFLAGS], Uses = [AX] in + def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in + def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in + def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; + let Defs = [AL, EFLAGS, AX], Uses = [AL] in + def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, + [(set AL, (node AL, (loadi8 addr:$src1))), (implicit EFLAGS)]>; + let Defs = [AX, DX, EFLAGS], Uses = [AX] in + def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in + def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in + def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; + + let Predicates = [In64BitMode] in { + let Defs = [AL, AX], Uses = [AL] in + def 8r_NF : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, NF; + let Defs = [AX, DX], Uses = [AX] in + def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, NF, PD; + let Defs = [EAX, EDX], Uses = [EAX] in + def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, NF; + let Defs = [RAX, RDX], Uses = [RAX] in + def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, NF; + let Defs = [AL, AX], Uses = [AL] in + def 8m_NF : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, NF; + let Defs = [AX, DX], Uses = [AX] in + def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, NF, PD; + let Defs = [EAX, EDX], Uses = [EAX] in + def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, NF; + let Defs = [RAX, RDX], Uses = [RAX] in + def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, NF; + + let Defs = [AL, EFLAGS, AX], Uses = [AL] in + def 8r_EVEX : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, []>, PL; + let Defs = [AX, DX, EFLAGS], Uses = [AX] in + def 16r_EVEX : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, PL, PD; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in + def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, PL; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in + def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>, PL; + let Defs = [AL, EFLAGS, AX], Uses = [AL] in + def 8m_EVEX : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, []>, PL; + let Defs = [AX, DX, EFLAGS], 
Uses = [AX] in + def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, PL, PD; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in + def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, PL; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX] in + def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, PL; + } } defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>; @@ -99,137 +135,341 @@ multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> { defvar sched16 = !if(!eq(m, "div"), WriteDiv16, WriteIDiv16); defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32); defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64); - let Defs = [AL,AH,EFLAGS], Uses = [AX] in - def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>; - let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in - def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; - let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in - def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; - let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in - def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>; - let Defs = [AL,AH,EFLAGS], Uses = [AX] in - def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>; - let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in - def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; - let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in - def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; - let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in - def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; + let Defs = [AL, AH, EFLAGS], Uses = [AX] in + def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>; + let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in + def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in + def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in + def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>; + let Defs = [AL, AH, EFLAGS], Uses = [AX] in + def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>; + let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in + def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in + def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in + def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; + + let Predicates = [In64BitMode] in { + let Defs = [AL, AH], Uses = [AX] in + def 8r_NF : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, NF; + let Defs = [AX, DX], Uses = [AX, DX] in + def 16r_NF : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, NF, PD; + let Defs = [EAX, EDX], Uses = [EAX, EDX] in + def 32r_NF : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, NF; + let Defs = [RAX, RDX], Uses = [RAX, RDX] in + def 64r_NF : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, NF; + let Defs = [AL, AH], Uses = [AX] in + def 8m_NF : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, NF; + let Defs = [AX, DX], Uses = [AX, DX] in + def 16m_NF : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, NF, PD; + let Defs = [EAX, EDX], Uses = [EAX, EDX] in + def 32m_NF : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, NF; + let Defs = [RAX, RDX], Uses = [RAX, RDX] in + def 64m_NF : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, NF; + + let Defs = [AL, AH, EFLAGS], Uses = [AX] in + def 8r_EVEX : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>, PL; + let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in + def 16r_EVEX : MulDivOpR<o, RegMRM, m, 
Xi16, sched16, []>, PL, PD; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in + def 32r_EVEX : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, PL; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in + def 64r_EVEX : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>, PL; + let Defs = [AL, AH, EFLAGS], Uses = [AX] in + def 8m_EVEX : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>, PL; + let Defs = [AX, DX, EFLAGS], Uses = [AX, DX] in + def 16m_EVEX : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, PL, PD; + let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EDX] in + def 32m_EVEX : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, PL; + let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RDX] in + def 64m_EVEX : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, PL; + } } + let hasSideEffects = 1 in { // so that we don't speculatively execute -defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; -defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>; + defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; + defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>; } -class IMulOpRR<X86TypeInfo t, X86FoldableSchedWrite sched> - : BinOpRR_RF<0xAF, "imul", t, X86smul_flag>, TB { +class IMulOpRR_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> + : BinOpRR_R<0xAF, "imul", t, ndd> { let Form = MRMSrcReg; let SchedRW = [sched]; // X = IMUL Y, Z --> X = IMUL Z, Y let isCommutable = 1; } -class IMulOpRM<X86TypeInfo t, X86FoldableSchedWrite sched> - : BinOpRM_RF<0xAF, "imul", t, X86smul_flag>, TB { -let Form = MRMSrcMem; -let SchedRW = [sched.Folded, sched.ReadAfterFold]; +class IMulOpRR_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> + : BinOpRR_RF<0xAF, "imul", t, X86smul_flag, ndd> { + let Form = MRMSrcReg; + let SchedRW = [sched]; + // X = IMUL Y, Z --> X = IMUL Z, Y + let isCommutable = 1; +} +class IMulOpRM_R<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> + : BinOpRM_R<0xAF, "imul", t, ndd> { + let Form = MRMSrcMem; + let SchedRW = [sched.Folded, sched.ReadAfterFold]; +} +class IMulOpRM_RF<X86TypeInfo t, X86FoldableSchedWrite sched, bit ndd = 0> + : BinOpRM_RF<0xAF, "imul", t, X86smul_flag, ndd> { + let Form = MRMSrcMem; + let SchedRW = [sched.Folded, sched.ReadAfterFold]; +} + +let Predicates = [NoNDD] in { + def IMUL16rr : IMulOpRR_RF<Xi16, WriteIMul16Reg>, TB, OpSize16; + def IMUL32rr : IMulOpRR_RF<Xi32, WriteIMul32Reg>, TB, OpSize32; + def IMUL64rr : IMulOpRR_RF<Xi64, WriteIMul64Reg>, TB; + def IMUL16rm : IMulOpRM_RF<Xi16, WriteIMul16Reg>, TB, OpSize16; + def IMUL32rm : IMulOpRM_RF<Xi32, WriteIMul32Reg>, TB, OpSize32; + def IMUL64rm : IMulOpRM_RF<Xi64, WriteIMul64Reg>, TB; +} +let Predicates = [HasNDD, In64BitMode] in { + def IMUL16rr_ND : IMulOpRR_RF<Xi16, WriteIMul16Reg, 1>, PD; + def IMUL32rr_ND : IMulOpRR_RF<Xi32, WriteIMul32Reg, 1>; + def IMUL64rr_ND : IMulOpRR_RF<Xi64, WriteIMul64Reg, 1>; + def IMUL16rm_ND : IMulOpRM_RF<Xi16, WriteIMul16Reg, 1>, PD; + def IMUL32rm_ND : IMulOpRM_RF<Xi32, WriteIMul32Reg, 1>; + def IMUL64rm_ND : IMulOpRM_RF<Xi64, WriteIMul64Reg, 1>; } -def IMUL16rr : IMulOpRR<Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rr : IMulOpRR<Xi32, WriteIMul32Reg>, OpSize32; -def IMUL64rr : IMulOpRR<Xi64, WriteIMul64Reg>; -def IMUL16rm : IMulOpRM<Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rm : IMulOpRM<Xi32, WriteIMul32Reg>, OpSize32; -def IMUL64rm : IMulOpRM<Xi64, WriteIMul64Reg>; +let Predicates = [In64BitMode], Pattern = [(null_frag)] in { + def IMUL16rr_NF : IMulOpRR_R<Xi16, WriteIMul16Reg>, NF, PD; + def IMUL32rr_NF : IMulOpRR_R<Xi32, WriteIMul32Reg>, NF; + def IMUL64rr_NF : IMulOpRR_R<Xi64, WriteIMul64Reg>, NF; + def IMUL16rm_NF : 
IMulOpRM_R<Xi16, WriteIMul16Reg>, NF, PD; + def IMUL32rm_NF : IMulOpRM_R<Xi32, WriteIMul32Reg>, NF; + def IMUL64rm_NF : IMulOpRM_R<Xi64, WriteIMul64Reg>, NF; + + def IMUL16rr_NF_ND : IMulOpRR_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD; + def IMUL32rr_NF_ND : IMulOpRR_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF; + def IMUL64rr_NF_ND : IMulOpRR_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF; + def IMUL16rm_NF_ND : IMulOpRM_R<Xi16, WriteIMul16Reg, 1>, EVEX_NF, PD; + def IMUL32rm_NF_ND : IMulOpRM_R<Xi32, WriteIMul32Reg, 1>, EVEX_NF; + def IMUL64rm_NF_ND : IMulOpRM_R<Xi64, WriteIMul64Reg, 1>, EVEX_NF; + + def IMUL16rr_EVEX : IMulOpRR_RF<Xi16, WriteIMul16Reg>, PL, PD; + def IMUL32rr_EVEX : IMulOpRR_RF<Xi32, WriteIMul32Reg>, PL; + def IMUL64rr_EVEX : IMulOpRR_RF<Xi64, WriteIMul64Reg>, PL; + def IMUL16rm_EVEX : IMulOpRM_RF<Xi16, WriteIMul16Reg>, PL, PD; + def IMUL32rm_EVEX : IMulOpRM_RF<Xi32, WriteIMul32Reg>, PL; + def IMUL64rm_EVEX : IMulOpRM_RF<Xi64, WriteIMul64Reg>, PL; +} class IMulOpRI8_R<X86TypeInfo t, X86FoldableSchedWrite sched> : BinOpRI8<0x6B, "imul", binop_ndd_args, t, MRMSrcReg, - (outs t.RegClass:$dst)>, DefEFLAGS { + (outs t.RegClass:$dst)> { let SchedRW = [sched]; } class IMulOpRI_R<X86TypeInfo t, X86FoldableSchedWrite sched> : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg, + (outs t.RegClass:$dst), []> { + let SchedRW = [sched]; +} +class IMulOpRI_RF<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1, t.ImmNoSuOperator:$src2))]>, DefEFLAGS { let SchedRW = [sched]; } class IMulOpMI8_R<X86TypeInfo t, X86FoldableSchedWrite sched> - : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)>, - DefEFLAGS { + : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)> { let Opcode = 0x6B; let SchedRW = [sched.Folded]; } class IMulOpMI_R<X86TypeInfo t, X86FoldableSchedWrite sched> : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem, + (outs t.RegClass:$dst), []> { + let SchedRW = [sched.Folded]; +} +class IMulOpMI_RF<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (X86smul_flag (t.LoadNode addr:$src1), t.ImmNoSuOperator:$src2))]>, DefEFLAGS { let SchedRW = [sched.Folded]; } -def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>; -def IMUL16rri : IMulOpRI_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rri : IMulOpRI_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rri32 : IMulOpRI_R<Xi64, WriteIMul64Imm>; - -def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>; -def IMUL16rmi : IMulOpMI_R<Xi16, WriteIMul16Imm>, OpSize16; -def IMUL32rmi : IMulOpMI_R<Xi32, WriteIMul32Imm>, OpSize32; -def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>; - +def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16; +def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32; +def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS; +def IMUL16rri : IMulOpRI_RF<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rri : IMulOpRI_RF<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rri32 : IMulOpRI_RF<Xi64, WriteIMul64Imm>; +def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, OpSize16; +def 
IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, OpSize32; +def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS; +def IMUL16rmi : IMulOpMI_RF<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rmi : IMulOpMI_RF<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rmi32 : IMulOpMI_RF<Xi64, WriteIMul64Imm>; + +let Predicates = [In64BitMode] in { + def IMUL16rri8_NF : IMulOpRI8_R<Xi16, WriteIMul16Imm>, NF, PD; + def IMUL32rri8_NF : IMulOpRI8_R<Xi32, WriteIMul32Imm>, NF; + def IMUL64rri8_NF : IMulOpRI8_R<Xi64, WriteIMul64Imm>, NF; + def IMUL16rri_NF : IMulOpRI_R<Xi16, WriteIMul16Imm>, NF, PD; + def IMUL32rri_NF : IMulOpRI_R<Xi32, WriteIMul32Imm>, NF; + def IMUL64rri32_NF : IMulOpRI_R<Xi64, WriteIMul64Imm>, NF; + def IMUL16rmi8_NF : IMulOpMI8_R<Xi16, WriteIMul16Imm>, NF, PD; + def IMUL32rmi8_NF : IMulOpMI8_R<Xi32, WriteIMul32Imm>, NF; + def IMUL64rmi8_NF : IMulOpMI8_R<Xi64, WriteIMul64Imm>, NF; + def IMUL16rmi_NF : IMulOpMI_R<Xi16, WriteIMul16Imm>, NF, PD; + def IMUL32rmi_NF : IMulOpMI_R<Xi32, WriteIMul32Imm>, NF; + def IMUL64rmi32_NF : IMulOpMI_R<Xi64, WriteIMul64Imm>, NF; + + def IMUL16rri8_EVEX : IMulOpRI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD; + def IMUL32rri8_EVEX : IMulOpRI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL; + def IMUL64rri8_EVEX : IMulOpRI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL; + def IMUL16rri_EVEX : IMulOpRI_RF<Xi16, WriteIMul16Imm>, PL, PD; + def IMUL32rri_EVEX : IMulOpRI_RF<Xi32, WriteIMul32Imm>, PL; + def IMUL64rri32_EVEX : IMulOpRI_RF<Xi64, WriteIMul64Imm>, PL; + def IMUL16rmi8_EVEX : IMulOpMI8_R<Xi16, WriteIMul16Imm>, DefEFLAGS, PL, PD; + def IMUL32rmi8_EVEX : IMulOpMI8_R<Xi32, WriteIMul32Imm>, DefEFLAGS, PL; + def IMUL64rmi8_EVEX : IMulOpMI8_R<Xi64, WriteIMul64Imm>, DefEFLAGS, PL; + def IMUL16rmi_EVEX : IMulOpMI_RF<Xi16, WriteIMul16Imm>, PL, PD; + def IMUL32rmi_EVEX : IMulOpMI_RF<Xi32, WriteIMul32Imm>, PL; + def IMUL64rmi32_EVEX : IMulOpMI_RF<Xi64, WriteIMul64Imm>, PL; +} //===----------------------------------------------------------------------===// // INC and DEC Instructions // -class IncOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> { +class IncOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag, ndd> { let Pattern = [(set t.RegClass:$dst, EFLAGS, (X86add_flag_nocf t.RegClass:$src1, 1))]; } -class DecOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> { +class DecOpR_RF<X86TypeInfo t, bit ndd = 0> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag, ndd> { let Pattern = [(set t.RegClass:$dst, EFLAGS, (X86sub_flag_nocf t.RegClass:$src1, 1))]; } -class IncOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { +class IncOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM0r, "inc", t, null_frag, ndd>; +class DecOpR_R<X86TypeInfo t, bit ndd = 0> : UnaryOpR_R<0xFF, MRM1r, "dec", t, null_frag, ndd>; +class IncOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1), (implicit EFLAGS)]; } -class DecOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { +class DecOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1), (implicit EFLAGS)]; } +class IncOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM0m, "inc", t, null_frag> { + let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), 1))]; +} +class DecOpM_RF<X86TypeInfo t> : UnaryOpM_RF<0xFF, MRM1m, "dec", t, null_frag> { + let Pattern = 
[(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), -1))]; +} +class IncOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_M<X86TypeInfo t> : UnaryOpM_M<0xFF, MRM1m, "dec", t, null_frag>; +class IncOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM0m, "inc", t, null_frag>; +class DecOpM_R<X86TypeInfo t> : UnaryOpM_R<0xFF, MRM1m, "dec", t, null_frag>; + // IncDec_Alt - Instructions like "inc reg" short forms. // Short forms only valid in 32-bit mode. Selected during MCInst lowering. class IncDec_Alt<bits<8> o, string m, X86TypeInfo t> : UnaryOpR_RF<o, AddRegFrm, m, t, null_frag>, Requires<[Not64BitMode]>; let isConvertibleToThreeAddress = 1 in { -def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; -def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; -def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; -def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; -def INC8r : IncOpR_RF<Xi8>; -def INC16r : IncOpR_RF<Xi16>, OpSize16; -def INC32r : IncOpR_RF<Xi32>, OpSize32; -def INC64r : IncOpR_RF<Xi64>; -def DEC8r : DecOpR_RF<Xi8>; -def DEC16r : DecOpR_RF<Xi16>, OpSize16; -def DEC32r : DecOpR_RF<Xi32>, OpSize32; -def DEC64r : DecOpR_RF<Xi64>; + def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; + def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; + def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; + def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; + let Predicates = [NoNDD] in { + def INC8r : IncOpR_RF<Xi8>; + def INC16r : IncOpR_RF<Xi16>, OpSize16; + def INC32r : IncOpR_RF<Xi32>, OpSize32; + def INC64r : IncOpR_RF<Xi64>; + def DEC8r : DecOpR_RF<Xi8>; + def DEC16r : DecOpR_RF<Xi16>, OpSize16; + def DEC32r : DecOpR_RF<Xi32>, OpSize32; + def DEC64r : DecOpR_RF<Xi64>; + } + let Predicates = [HasNDD, In64BitMode] in { + def INC8r_ND : IncOpR_RF<Xi8, 1>; + def INC16r_ND : IncOpR_RF<Xi16, 1>, PD; + def INC32r_ND : IncOpR_RF<Xi32, 1>; + def INC64r_ND : IncOpR_RF<Xi64, 1>; + def DEC8r_ND : DecOpR_RF<Xi8, 1>; + def DEC16r_ND : DecOpR_RF<Xi16, 1>, PD; + def DEC32r_ND : DecOpR_RF<Xi32, 1>; + def DEC64r_ND : DecOpR_RF<Xi64, 1>; + } + let Predicates = [In64BitMode], Pattern = [(null_frag)] in { + def INC8r_NF : IncOpR_R<Xi8>, NF; + def INC16r_NF : IncOpR_R<Xi16>, NF, PD; + def INC32r_NF : IncOpR_R<Xi32>, NF; + def INC64r_NF : IncOpR_R<Xi64>, NF; + def DEC8r_NF : DecOpR_R<Xi8>, NF; + def DEC16r_NF : DecOpR_R<Xi16>, NF, PD; + def DEC32r_NF : DecOpR_R<Xi32>, NF; + def DEC64r_NF : DecOpR_R<Xi64>, NF; + def INC8r_NF_ND : IncOpR_R<Xi8, 1>, NF; + def INC16r_NF_ND : IncOpR_R<Xi16, 1>, NF, PD; + def INC32r_NF_ND : IncOpR_R<Xi32, 1>, NF; + def INC64r_NF_ND : IncOpR_R<Xi64, 1>, NF; + def DEC8r_NF_ND : DecOpR_R<Xi8, 1>, NF; + def DEC16r_NF_ND : DecOpR_R<Xi16, 1>, NF, PD; + def DEC32r_NF_ND : DecOpR_R<Xi32, 1>, NF; + def DEC64r_NF_ND : DecOpR_R<Xi64, 1>, NF; + def INC8r_EVEX : IncOpR_RF<Xi8>, PL; + def INC16r_EVEX : IncOpR_RF<Xi16>, PL, PD; + def INC32r_EVEX : IncOpR_RF<Xi32>, PL; + def INC64r_EVEX : IncOpR_RF<Xi64>, PL; + def DEC8r_EVEX : DecOpR_RF<Xi8>, PL; + def DEC16r_EVEX : DecOpR_RF<Xi16>, PL, PD; + def DEC32r_EVEX : DecOpR_RF<Xi32>, PL; + def DEC64r_EVEX : DecOpR_RF<Xi64>, PL; + } } let Predicates = [UseIncDec] in { -def INC8m : IncOpM_M<Xi8>; -def INC16m : IncOpM_M<Xi16>, OpSize16; -def INC32m : IncOpM_M<Xi32>, OpSize32; -def DEC8m : DecOpM_M<Xi8>; -def DEC16m : DecOpM_M<Xi16>, OpSize16; -def DEC32m : DecOpM_M<Xi32>, OpSize32; + def INC8m : IncOpM_MF<Xi8>; + def INC16m : IncOpM_MF<Xi16>, OpSize16; + def INC32m : IncOpM_MF<Xi32>, 
OpSize32; + def DEC8m : DecOpM_MF<Xi8>; + def DEC16m : DecOpM_MF<Xi16>, OpSize16; + def DEC32m : DecOpM_MF<Xi32>, OpSize32; } let Predicates = [UseIncDec, In64BitMode] in { -def INC64m : IncOpM_M<Xi64>; -def DEC64m : DecOpM_M<Xi64>; + def INC64m : IncOpM_MF<Xi64>; + def DEC64m : DecOpM_MF<Xi64>; +} +let Predicates = [HasNDD, In64BitMode, UseIncDec] in { + def INC8m_ND : IncOpM_RF<Xi8>; + def INC16m_ND : IncOpM_RF<Xi16>, PD; + def INC32m_ND : IncOpM_RF<Xi32>; + def DEC8m_ND : DecOpM_RF<Xi8>; + def DEC16m_ND : DecOpM_RF<Xi16>, PD; + def DEC32m_ND : DecOpM_RF<Xi32>; + def INC64m_ND : IncOpM_RF<Xi64>; + def DEC64m_ND : DecOpM_RF<Xi64>; +} +let Predicates = [In64BitMode], Pattern = [(null_frag)] in { + def INC8m_NF : IncOpM_M<Xi8>, NF; + def INC16m_NF : IncOpM_M<Xi16>, NF, PD; + def INC32m_NF : IncOpM_M<Xi32>, NF; + def INC64m_NF : IncOpM_M<Xi64>, NF; + def DEC8m_NF : DecOpM_M<Xi8>, NF; + def DEC16m_NF : DecOpM_M<Xi16>, NF, PD; + def DEC32m_NF : DecOpM_M<Xi32>, NF; + def DEC64m_NF : DecOpM_M<Xi64>, NF; + def INC8m_NF_ND : IncOpM_R<Xi8>, NF; + def INC16m_NF_ND : IncOpM_R<Xi16>, NF, PD; + def INC32m_NF_ND : IncOpM_R<Xi32>, NF; + def INC64m_NF_ND : IncOpM_R<Xi64>, NF; + def DEC8m_NF_ND : DecOpM_R<Xi8>, NF; + def DEC16m_NF_ND : DecOpM_R<Xi16>, NF, PD; + def DEC32m_NF_ND : DecOpM_R<Xi32>, NF; + def DEC64m_NF_ND : DecOpM_R<Xi64>, NF; + def INC8m_EVEX : IncOpM_MF<Xi8>, PL; + def INC16m_EVEX : IncOpM_MF<Xi16>, PL, PD; + def INC32m_EVEX : IncOpM_MF<Xi32>, PL; + def INC64m_EVEX : IncOpM_MF<Xi64>, PL; + def DEC8m_EVEX : DecOpM_MF<Xi8>, PL; + def DEC16m_EVEX : DecOpM_MF<Xi16>, PL, PD; + def DEC32m_EVEX : DecOpM_MF<Xi32>, PL; + def DEC64m_EVEX : DecOpM_MF<Xi64>, PL; } //===----------------------------------------------------------------------===// @@ -350,212 +590,212 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { let Predicates = [NoNDD] in { - def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16; - def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32; - def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + def 8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + def 16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16; + def 32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32; + def 64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>; - def NAME#16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD; - def NAME#32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>; - def NAME#64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>; - def NAME#8rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF; - def NAME#16rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi16, 1>, EVEX_NF, PD; - def NAME#32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF; - def NAME#64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF; + def 8rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag, 1>; + def 16rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag, 1>, PD; + def 32rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag, 1>; + def 64rr_ND : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag, 1>; + def 8rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi8, 1>, EVEX_NF; + def 16rr_NF_ND : BinOpRR_R<BaseOpc, 
mnemonic, Xi16, 1>, EVEX_NF, PD; + def 32rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi32, 1>, EVEX_NF; + def 64rr_NF_ND : BinOpRR_R<BaseOpc, mnemonic, Xi64, 1>, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF; - def NAME#16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD; - def NAME#32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF; - def NAME#64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF; - def NAME#8rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; - def NAME#16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; - def NAME#32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; - def NAME#64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; + def 8rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi8>, NF; + def 16rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi16>, NF, PD; + def 32rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi32>, NF; + def 64rr_NF : BinOpRR_R<BaseOpc, mnemonic, Xi64>, NF; + def 8rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; + def 16rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; + def 32rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; + def 64rr_EVEX : BinOpRR_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; } } - def NAME#8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; - def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; - def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; + def 8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; + def 16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def 32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; + def 64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; let Predicates = [In64BitMode] in { - def NAME#8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; - def NAME#16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; - def NAME#32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; - def NAME#64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; - def NAME#8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; - def NAME#16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; - def NAME#32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; - def NAME#64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; - def NAME#8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; - def NAME#16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; - def NAME#32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; - def NAME#64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF; - def NAME#8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; - def NAME#16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; - def NAME#32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; - def NAME#64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; + def 8rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; + def 16rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; + def 32rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; + def 64rr_EVEX_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; + def 8rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; + def 16rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; + def 32rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; + def 64rr_ND_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; + 
def 8rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8>, NF; + def 16rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16>, NF, PD; + def 32rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32>, NF; + def 64rr_NF_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64>, NF; + def 8rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; + def 16rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; + def 32rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; + def 64rr_NF_ND_REV : BinOpRR_R_Rev<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; } let Predicates = [NoNDD] in { - def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; - def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; - def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; - def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + def 8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def 16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; + def 32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; + def 64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; - def NAME#16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD; - def NAME#32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; - def NAME#64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; - def NAME#8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; - def NAME#16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; - def NAME#32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; - def NAME#64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; + def 8rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag, 1>; + def 16rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag, 1>, PD; + def 32rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag, 1>; + def 64rm_ND : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag, 1>; + def 8rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi8, 1>, EVEX_NF; + def 16rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi16, 1>, EVEX_NF, PD; + def 32rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi32, 1>, EVEX_NF; + def 64rm_NF_ND : BinOpRM_R<BaseOpc2, mnemonic, Xi64, 1>, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; - def NAME#16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; - def NAME#32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; - def NAME#64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; - def NAME#8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; - def NAME#16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; - def NAME#32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; - def NAME#64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL; + def 8rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi8>, NF; + def 16rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi16>, NF, PD; + def 32rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi32>, NF; + def 64rm_NF : BinOpRM_R<BaseOpc2, mnemonic, Xi64>, NF; + def 8rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , null_frag>, PL; + def 16rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, null_frag>, PL, PD; + def 32rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, null_frag>, PL; + def 64rm_EVEX : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, null_frag>, PL; } let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { let Predicates = 
[NoNDD] in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; - def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; + def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; + def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; + def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; + def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; - def NAME#32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; - def NAME#64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; - def NAME#8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; - def NAME#16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; - def NAME#32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; - def NAME#64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; - def NAME#16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def NAME#32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def NAME#64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; - def NAME#8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; - def NAME#16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; - def NAME#32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; - def NAME#64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; + def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; + def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>; + def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>; + def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>; + def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD; + def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>; + def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>; + def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; + def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; + def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF; + def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD; + def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF; + def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; - def NAME#32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; - def NAME#64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; - def NAME#8ri_NF : BinOpRI_R<0x80, 
mnemonic, Xi8, RegMRM>, NF; - def NAME#16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; - def NAME#32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; - def NAME#64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; - def NAME#16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; - def NAME#32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; - def NAME#64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; - def NAME#8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; - def NAME#16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; - def NAME#32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; - def NAME#64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; + def 16ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM>, NF; + def 64ri8_NF : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM>, NF; + def 8ri_NF : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM>, NF; + def 16ri_NF : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM>, NF, PD; + def 32ri_NF : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM>, NF; + def 64ri32_NF : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM>, NF; + def 16ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; + def 32ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, PL; + def 64ri8_EVEX : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>, PL; + def 8ri_EVEX : BinOpRI_RF<0x80, mnemonic, Xi8 , null_frag, RegMRM>, PL; + def 16ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi16, null_frag, RegMRM>, PL, PD; + def 32ri_EVEX : BinOpRI_RF<0x81, mnemonic, Xi32, null_frag, RegMRM>, PL; + def 64ri32_EVEX: BinOpRI_RF<0x81, mnemonic, Xi64, null_frag, RegMRM>, PL; } } - def NAME#8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; - def NAME#32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>; - def NAME#8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF; - def NAME#16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD; - def NAME#32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF; - def NAME#64mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF; + def 8mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; + def 32mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi32, opnode>; + def 64mr_ND : BinOpMR_RF<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi8>, EVEX_NF; + def 16mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi16>, EVEX_NF, PD; + def 32mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi32>, EVEX_NF; + def 64mr_NF_ND : BinOpMR_R<BaseOpc, mnemonic, Xi64>, EVEX_NF; } let Predicates = [In64BitMode] in { - def NAME#8mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF; - def NAME#16mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD; - def NAME#32mr_NF : BinOpMR_M<BaseOpc, mnemonic, 
Xi32>, NF; - def NAME#64mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF; - def NAME#8mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; - def NAME#16mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; - def NAME#32mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; - def NAME#64mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; + def 8mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi8>, NF; + def 16mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi16>, NF, PD; + def 32mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi32>, NF; + def 64mr_NF : BinOpMR_M<BaseOpc, mnemonic, Xi64>, NF; + def 8mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; + def 16mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; + def 32mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; + def 64mr_EVEX : BinOpMR_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16; - def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32; + def 16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16; + def 32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; - def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; + def 8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#16mi8_ND : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD; - def NAME#32mi8_ND : BinOpMI8_RF<mnemonic, Xi32, MemMRM>; - def NAME#64mi8_ND : BinOpMI8_RF<mnemonic, Xi64, MemMRM>; - def NAME#8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; - def NAME#32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; - def NAME#16mi8_NF_ND : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi8_NF_ND : BinOpMI8_R<mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi8_NF_ND : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF; - def NAME#8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; - def NAME#16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF; + def 16mi8_ND : BinOpMI8_RF<mnemonic, Xi16, MemMRM>, PD; + def 32mi8_ND : BinOpMI8_RF<mnemonic, Xi32, MemMRM>; + def 64mi8_ND : BinOpMI8_RF<mnemonic, Xi64, MemMRM>; + def 8mi_ND : BinOpMI_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi_ND : BinOpMI_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; + def 32mi_ND : BinOpMI_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def 64mi32_ND : BinOpMI_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 16mi8_NF_ND : BinOpMI8_R<mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi8_NF_ND : BinOpMI8_R<mnemonic, Xi32, 
MemMRM>, NF; + def 64mi8_NF_ND : BinOpMI8_R<mnemonic, Xi64, MemMRM>, NF; + def 8mi_NF_ND : BinOpMI_R<0x80, mnemonic, Xi8, MemMRM>, NF; + def 16mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi_NF_ND : BinOpMI_R<0x81, mnemonic, Xi32, MemMRM>, NF; + def 64mi32_NF_ND : BinOpMI_R<0x81, mnemonic, Xi64, MemMRM>, NF; } let Predicates = [In64BitMode] in { - def NAME#16mi8_NF : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi8_NF : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi8_NF : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF; - def NAME#8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; - def NAME#16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; - def NAME#32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; - def NAME#64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; - def NAME#16mi8_EVEX : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD; - def NAME#32mi8_EVEX : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL; - def NAME#64mi8_EVEX : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL; - def NAME#8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; - def NAME#16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; - def NAME#32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; - def NAME#64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL; + def 16mi8_NF : BinOpMI8_M<mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi8_NF : BinOpMI8_M<mnemonic, Xi32, MemMRM>, NF; + def 64mi8_NF : BinOpMI8_M<mnemonic, Xi64, MemMRM>, NF; + def 8mi_NF : BinOpMI_M<0x80, mnemonic, Xi8, MemMRM>, NF; + def 16mi_NF : BinOpMI_M<0x81, mnemonic, Xi16, MemMRM>, NF, PD; + def 32mi_NF : BinOpMI_M<0x81, mnemonic, Xi32, MemMRM>, NF; + def 64mi32_NF : BinOpMI_M<0x81, mnemonic, Xi64, MemMRM>, NF; + def 16mi8_EVEX : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, PL, PD; + def 32mi8_EVEX : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, PL; + def 64mi8_EVEX : BinOpMI8_MF<mnemonic, Xi64, MemMRM>, PL; + def 8mi_EVEX : BinOpMI_MF<0x80, mnemonic, Xi8 , null_frag, MemMRM>, PL; + def 16mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi16, null_frag, MemMRM>, PL, PD; + def 32mi_EVEX : BinOpMI_MF<0x81, mnemonic, Xi32, null_frag, MemMRM>, PL; + def 64mi32_EVEX : BinOpMI_MF<0x81, mnemonic, Xi64, null_frag, MemMRM>, PL; } // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
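For context on the "order specific" notes in this multiclass: opcode 0x80 is the 8-bit-operand immediate group, 0x81 takes a full 16/32-bit immediate, and 0x83 takes a sign-extended 8-bit immediate (0x82 is the legacy alias of 0x80 that only decodes outside 64-bit mode). Listing the ri8/mi8 forms first nudges matching toward the shorter 0x83 encoding whenever the constant fits. A minimal C++ sketch of that fits-in-imm8 test; the helper name is made up for illustration and is not part of this patch:

  #include <cstdint>

  // True when Imm survives a round trip through a sign-extended 8-bit
  // immediate, i.e. the 0x83 form can encode it instead of the 0x81 form.
  static bool fitsInSignExtendedImm8(int64_t Imm) {
    return Imm >= INT8_MIN && Imm <= INT8_MAX;
  }

So something like add rax, 16 can use the short imm8 encoding, while add rax, 1000 has to fall back to the longer imm32 form.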
let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def NAME#8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; } - def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, + def 8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, + def 16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; - def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, + def 32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; - def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, + def 64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -571,162 +811,162 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, bit ConvertibleToThreeAddress> { let isCommutable = CommutableRR in { let Predicates = [NoNDD] in { - def NAME#8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; + def 8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>; + def 16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>; } } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>; + def 8rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode, 1>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD; - def NAME#32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>; - def NAME#64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>; + def 16rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode, 1>, PD; + def 32rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode, 1>; + def 64rr_ND : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode, 1>; } } } // isCommutable let Predicates = [In64BitMode] in { - def NAME#8rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; - def NAME#16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; - def NAME#32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; - def NAME#64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; + def 8rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; + def 16rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; + def 32rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, null_frag>, PL; + def 64rr_EVEX : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, null_frag>, PL; } - def NAME#8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; - def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; - def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>; + def 8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; + def 16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def 32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; + def 64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, 
Xi64>; let Predicates = [In64BitMode] in { - def NAME#8rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; - def NAME#16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; - def NAME#32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; - def NAME#64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; - def NAME#8rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; - def NAME#16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; - def NAME#32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; - def NAME#64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; + def 8rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8, 1>; + def 16rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16, 1>, PD; + def 32rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32, 1>; + def 64rr_ND_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64, 1>; + def 8rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>, PL; + def 16rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, PL, PD; + def 32rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, PL; + def 64rr_EVEX_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>, PL; } let Predicates = [NoNDD] in { - def NAME#8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>; + def 8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; + def 16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; + def 32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; + def 64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>; } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>; - def NAME#16rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD; - def NAME#32rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>; - def NAME#64rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>; + def 8rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode, 1>; + def 16rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode, 1>, PD; + def 32rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode, 1>; + def 64rm_ND : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode, 1>; } let Predicates = [In64BitMode] in { - def NAME#8rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL; - def NAME#16rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD; - def NAME#32rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL; - def NAME#64rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL; + def 8rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>, PL; + def 16rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, PL, PD; + def 32rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, PL; + def 64rm_EVEX : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>, PL; } let Predicates = [NoNDD] in { - def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def 8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
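Unlike the plain _RF classes earlier, the _RFF classes instantiated here take EFLAGS as an input in their patterns as well as defining it, which is what the carry-consuming instructions such as ADC and SBB need. A small C++ illustration of where such a flag-consuming pair typically comes from; the assembly mentioned in the comment is approximate, not output captured from this patch:

  // A 128-bit add is the classic producer of an ADD/ADC pair: the low half
  // sets the carry flag and the high half consumes it.
  unsigned __int128 add128(unsigned __int128 a, unsigned __int128 b) {
    return a + b; // lowers to an addq (writes CF) followed by an adcq (reads CF)
  }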
- def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; + def 16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; - def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; + def 16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def 32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; + def 64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; } } let Predicates = [HasNDD, In64BitMode] in { - def NAME#8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>; + def 8ri_ND : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM, 1>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; - def NAME#32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; - def NAME#64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; - def NAME#16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; - def NAME#32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; - def NAME#64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>; + def 16ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD; + def 32ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM, 1>; + def 64ri8_ND : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM, 1>; + def 16ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM, 1>, PD; + def 32ri_ND : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM, 1>; + def 64ri32_ND: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM, 1>; } } let Predicates = [In64BitMode] in { - def NAME#8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; - def NAME#16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; - def NAME#32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; - def NAME#64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; - def NAME#16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; - def NAME#32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; - def NAME#64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL; + def 8ri_EVEX : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>, PL; + def 16ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, PL, PD; + def 32ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, PL; + def 64ri8_EVEX : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>, PL; + def 16ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, PL, PD; + def 32ri_EVEX : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, PL; + def 64ri32_EVEX: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>, PL; } - def NAME#8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, 
opnode>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; - def NAME#32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi16, opnode>, PD; + def 32mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi32, opnode>; + def 64mr_ND : BinOpMRF_RF<BaseOpc, mnemonic, Xi64, opnode>; } let Predicates = [In64BitMode] in { - def NAME#8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; - def NAME#16mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; - def NAME#32mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; - def NAME#64mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; + def 8mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , null_frag>, PL; + def 16mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, null_frag>, PL, PD; + def 32mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, null_frag>, PL; + def 64mr_EVEX : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, null_frag>, PL; } // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16; - def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32; + def 8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16; + def 32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; - def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; + def 16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; let Predicates = [HasNDD, In64BitMode] in { - def NAME#8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi8_ND : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD; - def NAME#32mi8_ND : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>; - def NAME#64mi8_ND : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>; - def NAME#16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; - def NAME#32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; - def NAME#64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 8mi_ND : BinOpMIF_RF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi8_ND : BinOpMI8F_RF<mnemonic, Xi16, MemMRM>, PD; + def 32mi8_ND : BinOpMI8F_RF<mnemonic, Xi32, MemMRM>; + def 64mi8_ND : BinOpMI8F_RF<mnemonic, Xi64, MemMRM>; + def 16mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi16, opnode, MemMRM>, PD; + def 32mi_ND : BinOpMIF_RF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def 64mi32_ND : BinOpMIF_RF<0x81, mnemonic, Xi64, opnode, MemMRM>; } let Predicates = [In64BitMode] in { - def NAME#8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; - def NAME#16mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD; - def NAME#32mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi32, 
MemMRM>, PL; - def NAME#64mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL; - def NAME#16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; - def NAME#32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; - def NAME#64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL; + def 8mi_EVEX : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>, PL; + def 16mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, PL, PD; + def 32mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, PL; + def 64mi8_EVEX : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>, PL; + def 16mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, PL, PD; + def 32mi_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, PL; + def 64mi32_EVEX : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>, PL; } // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; - def NAME#8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; } - def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, + def 8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, + def 16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; - def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, + def 32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; - def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, + def 64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -739,71 +979,71 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { let isCommutable = CommutableRR in { - def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def 8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + def 16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; - def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; - def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + def 8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def 16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def 32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; + def 64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; - def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + def 8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def 16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, 
opnode>, OpSize16; + def 32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; + def 64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; - def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + def 8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; - def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; - def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; + def 16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def 32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; + def 64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; - def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; - def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; + def 16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def 32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; + def 64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; } - def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; - def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; - def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + def 8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def 16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def 32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; + def 64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16; - def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32; + def 16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16; + def 32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; + def 64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; - def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; - def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; + def 8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def 16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def 32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; + def 64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
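The _F classes used by ArithBinOp_F produce only EFLAGS, with no general-purpose register result; CMP is the typical instantiation. A short C++ example of source that lowers to such a flags-only operation; the assembly in the comment is approximate:

  // The subtraction result is thrown away; only EFLAGS survives and is then
  // materialized with a setcc.
  bool isLess(int a, int b) {
    return a < b; // roughly: cmpl %esi, %edi ; setl %al
  }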
let Predicates = [Not64BitMode] in { - def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def 8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; let mayLoad = 1 in - def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; + def 8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; } - def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + def 8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + def 16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; - def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + def 32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; - def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + def 64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -1119,14 +1359,34 @@ defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W; // We don't have patterns for these as there is no advantage over ADC for // most code. let Form = MRMSrcReg in { -def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32>, T8, PD; + def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64>, T8, PD; + def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32>, T8, XS; + def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64>, T8, XS; + let Predicates =[In64BitMode] in { + def ADCX32rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; + def ADCX64rr_EVEX : BinOpRRF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; + def ADOX32rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; + def ADOX64rr_EVEX : BinOpRRF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; + def ADCX32rr_ND : BinOpRRF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; + def ADCX64rr_ND : BinOpRRF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; + def ADOX32rr_ND : BinOpRRF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; + def ADOX64rr_ND : BinOpRRF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; + } } let Form = MRMSrcMem in { -def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; -def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; -def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; -def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32>, T8, PD; + def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64>, T8, PD; + def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32>, T8, XS; + def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64>, T8, XS; + let Predicates =[In64BitMode] in { + def ADCX32rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi32>, EVEX, T_MAP4, PD; + def ADCX64rm_EVEX : BinOpRMF_RF<0x66, "adcx", Xi64>, EVEX, T_MAP4, PD; + def ADOX32rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi32>, EVEX, T_MAP4, XS; + def ADOX64rm_EVEX : BinOpRMF_RF<0x66, "adox", Xi64>, EVEX, T_MAP4, XS; + def ADCX32rm_ND : BinOpRMF_RF<0x66, "adcx", Xi32, null_frag, 1>, PD; + def ADCX64rm_ND : BinOpRMF_RF<0x66, "adcx", Xi64, null_frag, 1>, PD; + def ADOX32rm_ND : BinOpRMF_RF<0x66, "adox", Xi32, null_frag, 1>, XS; + def ADOX64rm_ND : BinOpRMF_RF<0x66, "adox", Xi64, null_frag, 1>, XS; + } } diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index 6e76b44b66a3..8798b13a1761 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ 
b/llvm/lib/Target/X86/X86InstrFormats.td @@ -247,8 +247,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasREPPrefix = 0; // Does this inst have a REP prefix? bits<2> OpEncBits = OpEnc.Value; bit IgnoresW = 0; // Does this inst ignore REX_W field? - bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX - // instruction with VEX.W == 0. bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit @@ -279,10 +277,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); - // Used to prevent an explicit EVEX2VEX override for this instruction. - string EVEX2VEXOverride = ?; - - bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion. ExplicitOpPrefix explicitOpPrefix = NoExplicitOpPrefix; bits<2> explicitOpPrefixBits = explicitOpPrefix.Value; // TSFlags layout should be kept in sync with X86BaseInfo.h. diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index eac8d79eb8a3..eb0734f9a618 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -29,8 +29,10 @@ class X86Subtarget; namespace X86 { enum AsmComments { + // For instr that was compressed from EVEX to LEGACY. + AC_EVEX_2_LEGACY = MachineInstr::TAsmComments, // For instr that was compressed from EVEX to VEX. - AC_EVEX_2_VEX = MachineInstr::TAsmComments + AC_EVEX_2_VEX = AC_EVEX_2_LEGACY << 1 }; /// Return a pair of condition code for the given predicate and whether diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 305bd74f7bd7..97c625a64cfc 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1212,36 +1212,33 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { (implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>; } -multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, - RegisterClass RC, X86MemOperand x86memop, - X86FoldableSchedWrite sched, string Suffix = ""> { -let hasSideEffects = 0 in { - def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), - !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, VEX, VVVV, Sched<[sched]>; - let mayLoad = 1 in - def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), - !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, VEX, VVVV, Sched<[sched.Folded]>; -} +multiclass Bls<string m, Format RegMRM, Format MemMRM, X86TypeInfo t, string Suffix = ""> { + let SchedRW = [WriteBLS] in { + def rr#Suffix : UnaryOpR<0xF3, RegMRM, m, unaryop_ndd_args, t, + (outs t.RegClass:$dst), []>, T8, VVVV; + } + + let SchedRW = [WriteBLS.Folded] in + def rm#Suffix : UnaryOpM<0xF3, MemMRM, m, unaryop_ndd_args, t, + (outs t.RegClass:$dst), []>, T8, VVVV; } -let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS>; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS>, REX_W; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS>; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS>, REX_W; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS>; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS>, REX_W; +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32>, 
VEX; + defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64>, VEX; + defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32>, VEX; + defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64>, VEX; + defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32>, VEX; + defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64>, VEX; } -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { - defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; - defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; - defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, WriteBLS, "_EVEX">, EVEX; - defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, WriteBLS, "_EVEX">, REX_W, EVEX; +let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in { + defm BLSR32 : Bls<"blsr", MRM1r, MRM1m, Xi32, "_EVEX">, EVEX; + defm BLSR64 : Bls<"blsr", MRM1r, MRM1m, Xi64, "_EVEX">, EVEX; + defm BLSMSK32 : Bls<"blsmsk", MRM2r, MRM2m, Xi32, "_EVEX">, EVEX; + defm BLSMSK64 : Bls<"blsmsk", MRM2r, MRM2m, Xi64, "_EVEX">, EVEX; + defm BLSI32 : Bls<"blsi", MRM3r, MRM3m, Xi32, "_EVEX">, EVEX; + defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX; } let Predicates = [HasBMI] in { @@ -1281,50 +1278,35 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; } -multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC, - X86MemOperand x86memop, SDPatternOperator OpNode, - PatFrag ld_frag, X86FoldableSchedWrite Sched, - string Suffix = ""> { - def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, - T8, VEX, Sched<[Sched]>; -let mayLoad = 1 in - def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), - (implicit EFLAGS)]>, T8, VEX, - Sched<[Sched.Folded, - // x86memop:$src1 - ReadDefault, ReadDefault, ReadDefault, ReadDefault, - ReadDefault, - // RC:$src2 - Sched.ReadAfterFold]>; +multiclass Bmi4VOp3<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, + X86FoldableSchedWrite sched, string Suffix = ""> { + let SchedRW = [sched], Form = MRMSrcReg4VOp3 in + def rr#Suffix : BinOpRR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, t.RegClass:$src2))]>, T8; + let SchedRW = [sched.Folded, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + sched.ReadAfterFold], Form = MRMSrcMem4VOp3 in + def rm#Suffix : BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1), + t.RegClass:$src2))]>, T8; } let Predicates = [HasBMI, NoEGPR], Defs = [EFLAGS] in { - defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, - X86bextr, loadi32, WriteBEXTR>; - defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, - X86bextr, loadi64, WriteBEXTR>, REX_W; + defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR>, VEX; + defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR>, VEX; } let Predicates = [HasBMI2, NoEGPR], Defs = [EFLAGS] in { - defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, - X86bzhi, loadi32, 
WriteBZHI>; - defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, - X86bzhi, loadi64, WriteBZHI>, REX_W; + defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI>, VEX; + defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI>, VEX; } -let Predicates = [HasBMI, HasEGPR], Defs = [EFLAGS] in { - defm BEXTR32 : bmi4VOp3_base<0xF7, "bextr{l}", GR32, i32mem, - X86bextr, loadi32, WriteBEXTR, "_EVEX">, EVEX; - defm BEXTR64 : bmi4VOp3_base<0xF7, "bextr{q}", GR64, i64mem, - X86bextr, loadi64, WriteBEXTR, "_EVEX">, EVEX, REX_W; +let Predicates = [HasBMI, HasEGPR, In64BitMode], Defs = [EFLAGS] in { + defm BEXTR32 : Bmi4VOp3<0xF7, "bextr", Xi32, X86bextr, WriteBEXTR, "_EVEX">, EVEX; + defm BEXTR64 : Bmi4VOp3<0xF7, "bextr", Xi64, X86bextr, WriteBEXTR, "_EVEX">, EVEX; } -let Predicates = [HasBMI2, HasEGPR], Defs = [EFLAGS] in { - defm BZHI32 : bmi4VOp3_base<0xF5, "bzhi{l}", GR32, i32mem, - X86bzhi, loadi32, WriteBZHI, "_EVEX">, EVEX; - defm BZHI64 : bmi4VOp3_base<0xF5, "bzhi{q}", GR64, i64mem, - X86bzhi, loadi64, WriteBZHI, "_EVEX">, EVEX, REX_W; +let Predicates = [HasBMI2, HasEGPR, In64BitMode], Defs = [EFLAGS] in { + defm BZHI32 : Bmi4VOp3<0xF5, "bzhi", Xi32, X86bzhi, WriteBZHI, "_EVEX">, EVEX; + defm BZHI64 : Bmi4VOp3<0xF5, "bzhi", Xi64, X86bzhi, WriteBZHI, "_EVEX">, EVEX; } def CountTrailingOnes : SDNodeXForm<imm, [{ @@ -1371,22 +1353,22 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>, - VEX, VVVV, Sched<[WriteALU]>; + NoCD8, VVVV, Sched<[WriteALU]>; def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>, - VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + NoCD8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; } let Predicates = [HasBMI2, NoEGPR] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, - X86pdep, loadi32>, T8, XD; + X86pdep, loadi32>, T8, XD, VEX; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, - X86pdep, loadi64>, T8, XD, REX_W; + X86pdep, loadi64>, T8, XD, REX_W, VEX; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, - X86pext, loadi32>, T8, XS; + X86pext, loadi32>, T8, XS, VEX; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, - X86pext, loadi64>, T8, XS, REX_W; + X86pext, loadi64>, T8, XS, REX_W, VEX; } let Predicates = [HasBMI2, HasEGPR] in { diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 94fa6e45ded9..cb751639a057 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -8,8 +8,41 @@ def TruePredicate : Predicate<"true">; +// Intel x86 instructions have three separate encoding spaces: legacy, VEX, and +// EVEX. Not all X86 instructions are extended for EGPR. The following is an +// overview of which instructions are extended and how we implement them. +// +// * Legacy space +// All instructions in legacy maps 0 and 1 that have explicit GPR or memory +// operands can use the REX2 prefix to access the EGPR, except XSAVE*/XRSTOR. +// +// * EVEX space +// All instructions in the EVEX space can access the EGPR in their +// register/memory operands. 
+// +// For the above intructions, the only difference in encoding is reflected in +// the REX2/EVEX prefix when EGPR is used, i.e. the opcode and opcode name are +// unchanged. We don’t add new entries in TD, and instead we extend GPR with +// R16-R31 and make them allocatable only when the feature EGPR is available. +// +// Besides, some instructions in legacy space with map 2/3 and VEX space are +// promoted into EVEX space. Encoding space changes after the promotion, opcode +// and opcode map may change too sometimes. For these instructions, we add new +// entries in TD to avoid overcomplicating the assembler and disassembler. +// +// HasEGPR is for the new entries and NoEGPR is for the entries before +// promotion, so that the promoted variant can be selected first to benefit RA. def HasEGPR : Predicate<"Subtarget->hasEGPR()">; def NoEGPR : Predicate<"!Subtarget->hasEGPR()">; + +// APX extends some instructions with a new form that has an extra register +// operand called a new data destination (NDD). In such forms, NDD is the new +// destination register receiving the result of the computation and all other +// operands (including the original destination operand) become read-only source +// operands. +// +// HasNDD is for the new NDD entries and NoNDD is for the legacy 2-address +// entries, so that the NDD variant can be selected first to benefit RA. def HasNDD : Predicate<"Subtarget->hasNDD()">; def NoNDD : Predicate<"!Subtarget->hasNDD()">; def HasCMOV : Predicate<"Subtarget->canUseCMOV()">; diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index d13e3b7af69a..f951894db189 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -868,7 +868,7 @@ let Predicates = [HasBMI2, NoEGPR] in { defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W; } -let Predicates = [HasBMI2, HasEGPR] in { +let Predicates = [HasBMI2, HasEGPR, In64BitMode] in { defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX; defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX; defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX; diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index 699e5847e63f..b1be4739617d 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -695,14 +695,14 @@ def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), Requires<[Not64BitMode, HasINVPCID]>; def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD, - Requires<[In64BitMode, HasINVPCID]>; + Requires<[In64BitMode]>; def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invpcid\t{$src2, $src1|$src1, $src2}", []>, - EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>; + EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>; } // SchedRW -let Predicates = [In64BitMode, HasINVPCID] in { +let Predicates = [HasINVPCID, NoEGPR] in { // The instruction can only use a 64 bit register as the register argument // in 64 bit mode, while the intrinsic only accepts a 32 bit argument // corresponding to it. 
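The NDD comment added to X86InstrPredicates.td above captures the register-allocation point: the legacy arithmetic forms are two-address, with the destination tied to the first source, so keeping both inputs live across the operation costs a copy, while an APX NDD form writes an independent third register. A hedged C++ illustration; the assembly in the comments sketches typical lowering and is not output from this patch:

  // Both inputs stay live past the add, so the tied-destination form needs a
  // copy first:
  //   movl %edi, %eax ; addl %esi, %eax        (legacy two-address add)
  // while an NDD add can target a third register directly:
  //   addl %esi, %edi, %eax                    (APX NDD, destination untied)
  int useBothAfterAdd(int a, int b, int *sum) {
    *sum = a + b;  // the add whose result must not clobber a or b
    return a - b;  // a and b are still needed here
  }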
@@ -714,6 +714,13 @@ let Predicates = [In64BitMode, HasINVPCID] in { addr:$src2)>; } +let Predicates = [HasINVPCID, HasEGPR] in { + def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2), + (INVPCID64_EVEX + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit), + addr:$src2)>; +} + //===----------------------------------------------------------------------===// // SMAP Instruction diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index da85922a018d..f4ae15837fbf 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -43,8 +43,6 @@ class XOP { Encoding OpEnc = EncXOP; } class VEX { Encoding OpEnc = EncVEX; } class EVEX { Encoding OpEnc = EncEVEX; } class WIG { bit IgnoresW = 1; } -// Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX. -class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class VVVV { bit hasVEX_4V = 1; } @@ -66,9 +64,6 @@ class EVEX_CD8<int esize, CD8VForm form> { } class NoCD8 { bits<7> CD8_Scale = 0; } -class EVEX2VEXOverride<string VEXInstrName> { - string EVEX2VEXOverride = VEXInstrName; -} class AVX512BIi8Base : TB, PD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; @@ -89,7 +84,6 @@ class AVX512PDIi8Base : TB, PD { Domain ExeDomain = SSEPackedDouble; ImmType ImmT = Imm8; } -class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; } class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; } class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; } @@ -1005,7 +999,7 @@ class BinOpRR_RF_Rev<bits<8> o, string m, X86TypeInfo t, bit ndd = 0> } // BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write // EFLAGS. -class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : BinOpRR<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, t.RegClass:$src2, @@ -1041,7 +1035,7 @@ class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit (t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD<ndd>; // BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write // EFLAGS. -class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0> +class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>, diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index e1a67f61e766..133ee2041565 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -2055,10 +2055,11 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { } } - // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that - // are compressed from EVEX encoding to VEX encoding. 
+ // Add a comment about EVEX compression if (TM.Options.MCOptions.ShowMCEncoding) { - if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX) + if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_LEGACY) + OutStreamer->AddComment("EVEX TO LEGACY Compression ", false); + else if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX) OutStreamer->AddComment("EVEX TO VEX Compression ", false); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 5668b514d6de..b92bffbe6239 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -75,7 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeGlobalISel(PR); initializeWinEHStatePassPass(PR); initializeFixupBWInstPassPass(PR); - initializeEvexToVexInstPassPass(PR); + initializeCompressEVEXPassPass(PR); initializeFixupLEAPassPass(PR); initializeFPSPass(PR); initializeX86FixupSetCCPassPass(PR); @@ -575,7 +575,7 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupInstTuning()); addPass(createX86FixupVectorConstants()); } - addPass(createX86EvexToVexInsts()); + addPass(createX86CompressEVEXPass()); addPass(createX86DiscriminateMemOpsPass()); addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 49631f38017a..cd40b1d3b093 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2232,6 +2232,7 @@ InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, + { ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4 { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 05003ec304ad..1535eb622da6 100644 --- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -142,7 +142,7 @@ void XCoreDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::Constant: { - uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue(); + uint64_t Val = N->getAsZExtVal(); if (immMskBitp(N)) { // Transformation function: get the size of a mask // Look for the first non-zero bit |
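One recurring mechanical change in this patch, visible here in the XCore hunk, is replacing cast<ConstantSDNode>(N)->getZExtValue() with N->getAsZExtVal(). Judging from how it is substituted one-for-one at each call site, the new accessor is the old expression folded into a member call; a long-hand sketch of that shape (the free-function name below is made up, and this is not the actual SelectionDAGNodes.h implementation):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // What each rewritten call site used to spell out: cast the node to a
  // ConstantSDNode and read its value zero-extended to 64 bits.
  static uint64_t constantAsZExt(const SDNode *N) {
    return cast<ConstantSDNode>(N)->getZExtValue();
  }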