diff options
Diffstat (limited to 'llvm/lib')
625 files changed, 23355 insertions, 10841 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index d030f74481cf..49199060786c 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -249,11 +249,11 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call, bool IsMustAlias = true; ModRefInfo AllArgsMask = ModRefInfo::NoModRef; if (doesAccessArgPointees(MRB)) { - for (auto AI = Call->arg_begin(), AE = Call->arg_end(); AI != AE; ++AI) { - const Value *Arg = *AI; + for (const auto &I : llvm::enumerate(Call->args())) { + const Value *Arg = I.value(); if (!Arg->getType()->isPointerTy()) continue; - unsigned ArgIdx = std::distance(Call->arg_begin(), AI); + unsigned ArgIdx = I.index(); MemoryLocation ArgLoc = MemoryLocation::getForArgument(Call, ArgIdx, TLI); AliasResult ArgAlias = alias(ArgLoc, Loc, AAQI); @@ -696,14 +696,16 @@ ModRefInfo AAResults::getModRefInfo(const Instruction *I, case Instruction::AtomicRMW: return getModRefInfo((const AtomicRMWInst *)I, Loc, AAQIP); case Instruction::Call: - return getModRefInfo((const CallInst *)I, Loc, AAQIP); + case Instruction::CallBr: case Instruction::Invoke: - return getModRefInfo((const InvokeInst *)I, Loc, AAQIP); + return getModRefInfo((const CallBase *)I, Loc, AAQIP); case Instruction::CatchPad: return getModRefInfo((const CatchPadInst *)I, Loc, AAQIP); case Instruction::CatchRet: return getModRefInfo((const CatchReturnInst *)I, Loc, AAQIP); default: + assert(!I->mayReadOrWriteMemory() && + "Unhandled memory access instruction!"); return ModRefInfo::NoModRef; } } diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index db5167061509..177f38af13d8 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -35,6 +35,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeCFGOnlyPrinterLegacyPassPass(Registry); initializeCFLAndersAAWrapperPassPass(Registry); initializeCFLSteensAAWrapperPassPass(Registry); + 
initializeCycleInfoWrapperPassPass(Registry); initializeDependenceAnalysisWrapperPassPass(Registry); initializeDelinearizationPass(Registry); initializeDemandedBitsWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 88b0f37b1d48..5f1bf2001d47 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1699,6 +1699,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const GEPOperator *GV2 = dyn_cast<GEPOperator>(V2)) { AliasResult Result = aliasGEP(GV2, V2Size, V1, V1Size, O2, O1, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } @@ -1709,6 +1710,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const PHINode *PN = dyn_cast<PHINode>(V2)) { AliasResult Result = aliasPHI(PN, V2Size, V1, V1Size, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } @@ -1719,6 +1721,7 @@ AliasResult BasicAAResult::aliasCheckRecursive( return Result; } else if (const SelectInst *S2 = dyn_cast<SelectInst>(V2)) { AliasResult Result = aliasSelect(S2, V2Size, V1, V1Size, AAQI); + Result.swap(); if (Result != AliasResult::MayAlias) return Result; } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 8955658cb9e7..9b45f455be08 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -346,13 +346,16 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, if (Tracker->captured(U)) return; - // Not captured if only passed via 'nocapture' arguments. Note that - // calling a function pointer does not in itself cause the pointer to + // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) // the callee might return its own address. 
It is analogous to saying // that loading a value from a pointer does not cause the pointer to be // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). + if (Call->isCallee(U)) + break; + + // Not captured if only passed via 'nocapture' arguments. if (Call->isDataOperand(U) && !Call->doesNotCapture(Call->getDataOperandNo(U))) { // The parameter is not marked 'nocapture' - captured. diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3ed3b8902343..922b38e92785 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -352,6 +352,9 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy, const DataLayout &DL) { do { Type *SrcTy = C->getType(); + if (SrcTy == DestTy) + return C; + TypeSize DestSize = DL.getTypeSizeInBits(DestTy); TypeSize SrcSize = DL.getTypeSizeInBits(SrcTy); if (!TypeSize::isKnownGE(SrcSize, DestSize)) @@ -705,7 +708,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty, // is all undef or zero, we know what it loads. if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) { if (GV->isConstant() && GV->hasDefinitiveInitializer()) { - if (GV->getInitializer()->isNullValue()) + if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() && + !Ty->isX86_AMXTy()) return Constant::getNullValue(Ty); if (isa<UndefValue>(GV->getInitializer())) return UndefValue::get(Ty); @@ -881,7 +885,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, InnermostGEP = GEP; InBounds &= GEP->isInBounds(); - SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end()); + SmallVector<Value *, 4> NestedOps(llvm::drop_begin(GEP->operands())); // Do not try the incorporate the sub-GEP if some index is not a number. 
bool AllConstantInt = true; @@ -1774,15 +1778,8 @@ static bool mayFoldConstrained(ConstrainedFPIntrinsic *CI, // If the operation does not change exception status flags, it is safe // to fold. - if (St == APFloat::opStatus::opOK) { - // When FP exceptions are not ignored, intrinsic call will not be - // eliminated, because it is considered as having side effect. But we - // know that its evaluation does not raise exceptions, so side effect - // is absent. To allow removing the call, mark it as not accessing memory. - if (EB && *EB != fp::ExceptionBehavior::ebIgnore) - CI->addFnAttr(Attribute::ReadNone); + if (St == APFloat::opStatus::opOK) return true; - } // If evaluation raised FP exception, the result can depend on rounding // mode. If the latter is unknown, folding is not possible. @@ -2960,10 +2957,6 @@ static Constant *ConstantFoldFixedVectorCall( if (auto *Op = dyn_cast<ConstantInt>(Operands[0])) { unsigned Lanes = FVTy->getNumElements(); uint64_t Limit = Op->getZExtValue(); - // vctp64 are currently modelled as returning a v4i1, not a v2i1. Make - // sure we get the limit right in that case and set all relevant lanes. - if (IntrinsicID == Intrinsic::arm_mve_vctp64) - Limit *= 2; SmallVector<Constant *, 16> NCs; for (unsigned i = 0; i < Lanes; i++) { diff --git a/llvm/lib/Analysis/CycleAnalysis.cpp b/llvm/lib/Analysis/CycleAnalysis.cpp new file mode 100644 index 000000000000..09c7ee67e05c --- /dev/null +++ b/llvm/lib/Analysis/CycleAnalysis.cpp @@ -0,0 +1,77 @@ +//===- CycleAnalysis.cpp - Compute CycleInfo for LLVM IR ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/CycleAnalysis.h" +#include "llvm/ADT/GenericCycleImpl.h" +#include "llvm/IR/CFG.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +template class llvm::GenericCycleInfo<SSAContext>; +template class llvm::GenericCycle<SSAContext>; + +CycleInfo CycleAnalysis::run(Function &F, FunctionAnalysisManager &) { + CycleInfo CI; + CI.compute(F); + return CI; +} + +AnalysisKey CycleAnalysis::Key; + +CycleInfoPrinterPass::CycleInfoPrinterPass(raw_ostream &OS) : OS(OS) {} + +PreservedAnalyses CycleInfoPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "CycleInfo for function: " << F.getName() << "\n"; + AM.getResult<CycleAnalysis>(F).print(OS); + + return PreservedAnalyses::all(); +} + +//===----------------------------------------------------------------------===// +// CycleInfoWrapperPass Implementation +//===----------------------------------------------------------------------===// +// +// The implementation details of the wrapper pass that holds a CycleInfo +// suitable for use with the legacy pass manager. 
+// +//===----------------------------------------------------------------------===// + +char CycleInfoWrapperPass::ID = 0; + +CycleInfoWrapperPass::CycleInfoWrapperPass() : FunctionPass(ID) { + initializeCycleInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(CycleInfoWrapperPass, "cycles", "Cycle Info Analysis", + true, true) +INITIALIZE_PASS_END(CycleInfoWrapperPass, "cycles", "Cycle Info Analysis", true, + true) + +void CycleInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +bool CycleInfoWrapperPass::runOnFunction(Function &Func) { + CI.clear(); + + F = &Func; + CI.compute(Func); + return false; +} + +void CycleInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + OS << "CycleInfo for function: " << F->getName() << "\n"; + CI.print(OS); +} + +void CycleInfoWrapperPass::releaseMemory() { + CI.clear(); + F = nullptr; +} diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp index d87fa849d839..31b2dafa29b4 100644 --- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp +++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp @@ -16,6 +16,8 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/InlineSizeEstimatorAnalysis.h" #include "llvm/Analysis/MLInlineAdvisor.h" +#include "llvm/Analysis/ModelUnderTrainingRunner.h" +#include "llvm/Analysis/NoInferenceModelRunner.h" #include "llvm/Analysis/Utils/TFUtils.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" @@ -94,7 +96,6 @@ struct InlineEvent { /// Because this is a protobuf, we cannot just stream the events as they come. /// Internally, TrainingLogger stores data in column-major format, because that /// lines up with how TF SequenceExample represents it. 
-class ModelUnderTrainingRunner; class TrainingLogger final { public: TrainingLogger(StringRef LogFileName, const ModelUnderTrainingRunner *MUTR); @@ -261,65 +262,21 @@ private: const int64_t Mandatory; }; -/// A pseudo model runner. We use it to store feature values when collecting -/// logs for the default policy, but never ask it to 'run'. -class NoInferenceModelRunner : public MLModelRunner { -public: - NoInferenceModelRunner(LLVMContext &Ctx) - : MLModelRunner(Ctx), Features(NumberOfFeatures) {} - void setFeature(FeatureIndex Index, int64_t Value) override { - Features[static_cast<int>(Index)] = Value; - } - - int64_t getFeature(int Index) const override { return Features[Index]; } - bool run() override { - llvm_unreachable("We shouldn't call run on this model runner."); - } - -private: - InlineFeatures Features; -}; - -/// ModelUnderTrainingRunner - training mode implementation. It uses TF C APIs -/// to dynamically load and evaluate a TF SavedModel -/// (https://www.tensorflow.org/guide/saved_model). Runtime performance is -/// sacrificed for ease of use while training. -class ModelUnderTrainingRunner final : public MLModelRunner { -public: - ModelUnderTrainingRunner(LLVMContext &Ctx, const std::string &ModelPath); - - bool run() override; +static const std::vector<TensorSpec> TrainingOnlyFeatures{ + TensorSpec::createSpec<int64_t>(TFFeedPrefix + "inlining_default", {1}), + TensorSpec::createSpec<float>(TFFeedPrefix + "discount", {1}), + TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}), + TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})}; - // Disallows copy and assign. 
- ModelUnderTrainingRunner(const ModelUnderTrainingRunner &) = delete; - ModelUnderTrainingRunner & - operator=(const ModelUnderTrainingRunner &) = delete; - - void setFeature(FeatureIndex Index, int64_t Value) override; - int64_t getFeature(int Index) const override; - bool isValid() const { return !!Evaluator; } - - const std::vector<LoggedFeatureSpec> &outputLoggedFeatureSpecs() const { - return OutputSpecs; - } - - const Optional<TFModelEvaluator::EvaluationResult> & - lastEvaluationResult() const { - return LastEvaluationResult; - } - -private: - std::unique_ptr<TFModelEvaluator> Evaluator; - std::vector<LoggedFeatureSpec> OutputSpecs; - Optional<TFModelEvaluator::EvaluationResult> LastEvaluationResult; +static const std::vector<TensorSpec> getInputFeatures() { + std::vector<TensorSpec> InputSpecs; + for (size_t I = 0; I < NumberOfFeatures; ++I) + InputSpecs.push_back( + TensorSpec::createSpec<int64_t>(TFFeedPrefix + FeatureNameMap[I], {1})); + append_range(InputSpecs, TrainingOnlyFeatures); + return InputSpecs; +} - // The training framework needs some additional features. 
- const std::vector<TensorSpec> TrainingOnlyFeatures{ - TensorSpec::createSpec<int64_t>(TFFeedPrefix + "inlining_default", {1}), - TensorSpec::createSpec<float>(TFFeedPrefix + "discount", {1}), - TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}), - TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})}; -}; } // namespace TrainingLogger::TrainingLogger(StringRef LogFileName, @@ -353,7 +310,7 @@ void TrainingLogger::logInlineEvent(const InlineEvent &Event, const MLModelRunner &ModelRunner) { size_t CurrentFeature = 0; for (; CurrentFeature < NumberOfFeatures; ++CurrentFeature) { - int64_t F = ModelRunner.getFeature(CurrentFeature); + int64_t F = *ModelRunner.getTensor<int64_t>(CurrentFeature); L->logInt64Value(CurrentFeature, &F); } @@ -433,7 +390,9 @@ DevelopmentModeMLInlineAdvisor::getAdviceFromModel( return MLInlineAdvisor::getAdviceFromModel(CB, ORE); bool DefaultAdvice = GetDefaultAdvice(CB); - auto Recommendation = IsDoingInference ? ModelRunner->run() : DefaultAdvice; + auto Recommendation = + IsDoingInference ? 
static_cast<bool>(ModelRunner->evaluate<int64_t>()) + : DefaultAdvice; return std::make_unique<LoggingMLInlineAdvice>( /*Advisor=*/this, /*CB=*/CB, /*ORE=*/ORE, /*Recommendation=*/Recommendation, @@ -458,49 +417,6 @@ size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() { return Ret; } -ModelUnderTrainingRunner::ModelUnderTrainingRunner(LLVMContext &Ctx, - const std::string &ModelPath) - : MLModelRunner(Ctx) { - std::vector<TensorSpec> InputSpecs; - for (size_t I = 0; I < NumberOfFeatures; ++I) - InputSpecs.push_back( - TensorSpec::createSpec<int64_t>(TFFeedPrefix + FeatureNameMap[I], {1})); - append_range(InputSpecs, TrainingOnlyFeatures); - if (auto MaybeOutSpecs = - loadOutputSpecs(Ctx, DecisionName, ModelPath, TFOutputSpecOverride)) - OutputSpecs = std::move(*MaybeOutSpecs); - else - return; - - Evaluator = std::make_unique<TFModelEvaluator>( - ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, - OutputSpecs.size()); - if (!Evaluator || !Evaluator->isValid()) { - Ctx.emitError("Failed to create inliner saved model evaluator"); - Evaluator.reset(); - return; - } -} - -bool ModelUnderTrainingRunner::run() { - LastEvaluationResult = Evaluator->evaluate(); - if (!LastEvaluationResult.hasValue()) { - Ctx.emitError("Error evaluating model."); - return false; - } - int64_t Decision = *LastEvaluationResult->getTensorValue<int64_t>(0); - return static_cast<bool>(Decision); -} - -int64_t ModelUnderTrainingRunner::getFeature(int Index) const { - return *Evaluator->getInput<int64_t>(Index); -} - -void ModelUnderTrainingRunner::setFeature(FeatureIndex Index, int64_t Value) { - size_t NumericIndex = static_cast<size_t>(Index); - *(Evaluator->getInput<int64_t>(NumericIndex)) = Value; -} - std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( Module &M, ModuleAnalysisManager &MAM, std::function<bool(CallBase &)> GetDefaultAdvice) { @@ -509,10 +425,13 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor( ModelUnderTrainingRunner 
*MUTRPtr = nullptr; bool IsDoingInference = false; if (TFModelUnderTrainingPath.empty()) - Runner.reset(new NoInferenceModelRunner(Ctx)); + Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures())); else { - auto MUTR = std::make_unique<ModelUnderTrainingRunner>( - Ctx, TFModelUnderTrainingPath); + std::unique_ptr<ModelUnderTrainingRunner> MUTR; + if (auto MaybeOutputSpecs = loadOutputSpecs( + Ctx, DecisionName, TFModelUnderTrainingPath, TFOutputSpecOverride)) + MUTR = std::make_unique<ModelUnderTrainingRunner>( + Ctx, TFModelUnderTrainingPath, getInputFeatures(), *MaybeOutputSpecs); if (!MUTR || !MUTR->isValid()) { Ctx.emitError("Could not load the policy model from the provided path"); return nullptr; diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index cfe910df4e91..f5fa6748d053 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -933,7 +933,7 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence( /// This function returns the identity element (or neutral element) for /// the operation K. Value *RecurrenceDescriptor::getRecurrenceIdentity(RecurKind K, Type *Tp, - FastMathFlags FMF) { + FastMathFlags FMF) const { switch (K) { case RecurKind::Xor: case RecurKind::Add: diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp index 73d1eff1b968..140c88eb8b0d 100644 --- a/llvm/lib/Analysis/InlineAdvisor.cpp +++ b/llvm/lib/Analysis/InlineAdvisor.cpp @@ -40,6 +40,10 @@ static cl::opt<bool> " callsites processed by inliner but decided" " to be not inlined")); +static cl::opt<bool> EnableInlineDeferral("inline-deferral", cl::init(false), + cl::Hidden, + cl::desc("Enable deferred inlining")); + // An integer used to limit the cost of inline deferral. The default negative // number tells shouldBeDeferred to only take the secondary cost into account. 
static cl::opt<int> @@ -136,8 +140,9 @@ llvm::Optional<llvm::InlineCost> static getDefaultInlineAdvice( return getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache, GetTLI, GetBFI, PSI, RemarksEnabled ? &ORE : nullptr); }; - return llvm::shouldInline(CB, GetInlineCost, ORE, - Params.EnableDeferral.getValueOr(false)); + return llvm::shouldInline( + CB, GetInlineCost, ORE, + Params.EnableDeferral.getValueOr(EnableInlineDeferral)); } std::unique_ptr<InlineAdvice> @@ -409,8 +414,6 @@ llvm::shouldInline(CallBase &CB, << "' in other contexts"; }); setInlineRemark(CB, "deferred"); - // IC does not bool() to false, so get an InlineCost that will. - // This will not be inspected to make an error message. return None; } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 22d2ce11cc90..4831b22b1d46 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2173,6 +2173,15 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, } } + // ((X | Y) ^ X ) & ((X | Y) ^ Y) --> 0 + // ((X | Y) ^ Y ) & ((X | Y) ^ X) --> 0 + BinaryOperator *Or; + if (match(Op0, m_c_Xor(m_Value(X), + m_CombineAnd(m_BinOp(Or), + m_c_Or(m_Deferred(X), m_Value(Y))))) && + match(Op1, m_c_Xor(m_Specific(Or), m_Specific(Y)))) + return Constant::getNullValue(Op0->getType()); + return nullptr; } @@ -2198,6 +2207,18 @@ static Value *simplifyOrLogic(Value *X, Value *Y) { Value *A, *B; + // (A ^ B) | (A | B) --> A | B + // (A ^ B) | (B | A) --> B | A + if (match(X, m_Xor(m_Value(A), m_Value(B))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return Y; + + // ~(A ^ B) | (A | B) --> -1 + // ~(A ^ B) | (B | A) --> -1 + if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && + match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + return ConstantInt::getAllOnesValue(Ty); + // (A & ~B) | (A ^ B) --> A ^ B // (~B & A) | (A ^ B) --> A ^ B // (A & ~B) | (B ^ A) --> B ^ A @@ -2214,18 +2235,33 @@ 
static Value *simplifyOrLogic(Value *X, Value *Y) { match(Y, m_c_And(m_Specific(A), m_Specific(B)))) return X; - // (A ^ B) | (A | B) --> A | B - // (A ^ B) | (B | A) --> B | A - if (match(X, m_Xor(m_Value(A), m_Value(B))) && - match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) - return Y; - - // ~(A ^ B) | (A | B) --> -1 - // ~(A ^ B) | (B | A) --> -1 - if (match(X, m_Not(m_Xor(m_Value(A), m_Value(B)))) && - match(Y, m_c_Or(m_Specific(A), m_Specific(B)))) + // (~A | B) | (A ^ B) --> -1 + // (~A | B) | (B ^ A) --> -1 + // (B | ~A) | (A ^ B) --> -1 + // (B | ~A) | (B ^ A) --> -1 + if (match(X, m_c_Or(m_Not(m_Value(A)), m_Value(B))) && + match(Y, m_c_Xor(m_Specific(A), m_Specific(B)))) return ConstantInt::getAllOnesValue(Ty); + // (~A & B) | ~(A | B) --> ~A + // (~A & B) | ~(B | A) --> ~A + // (B & ~A) | ~(A | B) --> ~A + // (B & ~A) | ~(B | A) --> ~A + Value *NotA; + if (match(X, + m_c_And(m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))), + m_Value(B))) && + match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) + return NotA; + + // ~(A ^ B) | (A & B) --> ~(A & B) + // ~(A ^ B) | (B & A) --> ~(A & B) + Value *NotAB; + if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))), + m_Value(NotAB))) && + match(Y, m_c_And(m_Specific(A), m_Specific(B)))) + return NotAB; + return nullptr; } @@ -2259,27 +2295,6 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, if (Value *V = simplifyLogicOfAddSub(Op0, Op1, Instruction::Or)) return V; - Value *A, *B, *NotA; - - // (~A & B) | ~(A | B) --> ~A - // (~A & B) | ~(B | A) --> ~A - // (B & ~A) | ~(A | B) --> ~A - // (B & ~A) | ~(B | A) --> ~A - if (match(Op0, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), - m_Value(B))) && - match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return NotA; - - // Commute the 'or' operands. 
- // ~(A | B) | (~A & B) --> ~A - // ~(B | A) | (~A & B) --> ~A - // ~(A | B) | (B & ~A) --> ~A - // ~(B | A) | (B & ~A) --> ~A - if (match(Op1, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), - m_Value(B))) && - match(Op0, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) - return NotA; - // Rotated -1 is still -1: // (-1 << X) | (-1 >> (C - X)) --> -1 // (-1 >> X) | (-1 << (C - X)) --> -1 @@ -2335,6 +2350,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, } // (A & C1)|(B & C2) + Value *A, *B; const APInt *C1, *C2; if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && match(Op1, m_And(m_Value(B), m_APInt(C2)))) { @@ -2696,9 +2712,17 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, if (!OpTy->isIntOrIntVectorTy(1)) return nullptr; - // A boolean compared to true/false can be simplified in 14 out of the 20 - // (10 predicates * 2 constants) possible combinations. Cases not handled here - // require a 'not' of the LHS, so those must be transformed in InstCombine. + // A boolean compared to true/false can be reduced in 14 out of the 20 + // (10 predicates * 2 constants) possible combinations. The other + // 6 cases require a 'not' of the LHS. 
+ + auto ExtractNotLHS = [](Value *V) -> Value * { + Value *X; + if (match(V, m_Not(m_Value(X)))) + return X; + return nullptr; + }; + if (match(RHS, m_Zero())) { switch (Pred) { case CmpInst::ICMP_NE: // X != 0 -> X @@ -2706,6 +2730,13 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLT: // X <s 0 -> X return LHS; + case CmpInst::ICMP_EQ: // not(X) == 0 -> X != 0 -> X + case CmpInst::ICMP_ULE: // not(X) <=u 0 -> X >u 0 -> X + case CmpInst::ICMP_SGE: // not(X) >=s 0 -> X <s 0 -> X + if (Value *X = ExtractNotLHS(LHS)) + return X; + break; + case CmpInst::ICMP_ULT: // X <u 0 -> false case CmpInst::ICMP_SGT: // X >s 0 -> false return getFalse(ITy); @@ -2723,6 +2754,13 @@ static Value *simplifyICmpOfBools(CmpInst::Predicate Pred, Value *LHS, case CmpInst::ICMP_SLE: // X <=s -1 -> X return LHS; + case CmpInst::ICMP_NE: // not(X) != 1 -> X == 1 -> X + case CmpInst::ICMP_ULT: // not(X) <=u 1 -> X >=u 1 -> X + case CmpInst::ICMP_SGT: // not(X) >s 1 -> X <=s -1 -> X + if (Value *X = ExtractNotLHS(LHS)) + return X; + break; + case CmpInst::ICMP_UGT: // X >u 1 -> false case CmpInst::ICMP_SLT: // X <s -1 -> false return getFalse(ITy); @@ -5887,9 +5925,9 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) { auto Attr = Call->getFunction()->getFnAttribute(Attribute::VScaleRange); if (!Attr.isValid()) return nullptr; - unsigned VScaleMin, VScaleMax; - std::tie(VScaleMin, VScaleMax) = Attr.getVScaleRangeArgs(); - if (VScaleMin == VScaleMax && VScaleMax != 0) + unsigned VScaleMin = Attr.getVScaleRangeMin(); + Optional<unsigned> VScaleMax = Attr.getVScaleRangeMax(); + if (VScaleMax && VScaleMin == VScaleMax) return ConstantInt::get(F->getReturnType(), VScaleMin); return nullptr; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 19a24ac6a484..6444518dc70c 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ 
-1568,11 +1568,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout(); uint64_t TypeByteSize = DL.getTypeAllocSize(ATy); + bool HasSameSize = + DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy); uint64_t Stride = std::abs(StrideAPtr); const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist); if (!C) { - if (!isa<SCEVCouldNotCompute>(Dist) && - TypeByteSize == DL.getTypeAllocSize(BTy) && + if (!isa<SCEVCouldNotCompute>(Dist) && HasSameSize && isSafeDependenceDistance(DL, *(PSE.getSE()), *(PSE.getBackedgeTakenCount()), *Dist, Stride, TypeByteSize)) @@ -1587,7 +1588,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, int64_t Distance = Val.getSExtValue(); // Attempt to prove strided accesses independent. - if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy && + if (std::abs(Distance) > 0 && Stride > 1 && HasSameSize && areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) { LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); return Dependence::NoDep; @@ -1598,7 +1599,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, bool IsTrueDataDependence = (AIsWrite && !BIsWrite); if (IsTrueDataDependence && EnableForwardingConflictDetection && (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) || - ATy != BTy)) { + !HasSameSize)) { LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); return Dependence::ForwardButPreventsForwarding; } @@ -1608,21 +1609,19 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, } // Write to the same location with the same size. - // Could be improved to assert type sizes are the same (i32 == float, etc). 
if (Val == 0) { - if (ATy == BTy) + if (HasSameSize) return Dependence::Forward; LLVM_DEBUG( - dbgs() << "LAA: Zero dependence difference but different types\n"); + dbgs() << "LAA: Zero dependence difference but different type sizes\n"); return Dependence::Unknown; } assert(Val.isStrictlyPositive() && "Expect a positive value"); - if (ATy != BTy) { - LLVM_DEBUG( - dbgs() - << "LAA: ReadWrite-Write positive dependency with different types\n"); + if (!HasSameSize) { + LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with " + "different type sizes\n"); return Dependence::Unknown; } diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp index 6fc4c42bdd71..f5a65cd2b689 100644 --- a/llvm/lib/Analysis/MLInlineAdvisor.cpp +++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp @@ -35,6 +35,21 @@ using namespace llvm; +#ifdef LLVM_HAVE_TF_AOT +#include "llvm/Analysis/ReleaseModeModelRunner.h" +// codegen-ed file +#include "InlinerSizeModel.h" // NOLINT +#include "llvm/Analysis/InlineModelFeatureMaps.h" + +std::unique_ptr<InlineAdvisor> +llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { + auto AOTRunner = + std::make_unique<ReleaseModeModelRunner<llvm::InlinerSizeModel>>( + M.getContext(), FeatureNameMap, DecisionName); + return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner)); +} +#endif + #define DEBUG_TYPE "inline-ml" static cl::opt<float> SizeIncreaseThreshold( @@ -245,29 +260,32 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { auto &CallerBefore = FAM.getResult<FunctionPropertiesAnalysis>(Caller); auto &CalleeBefore = FAM.getResult<FunctionPropertiesAnalysis>(Callee); - ModelRunner->setFeature(FeatureIndex::CalleeBasicBlockCount, - CalleeBefore.BasicBlockCount); - ModelRunner->setFeature(FeatureIndex::CallSiteHeight, - FunctionLevels[&Caller]); - ModelRunner->setFeature(FeatureIndex::NodeCount, NodeCount); - ModelRunner->setFeature(FeatureIndex::NrCtantParams, 
NrCtantParams); - ModelRunner->setFeature(FeatureIndex::EdgeCount, EdgeCount); - ModelRunner->setFeature(FeatureIndex::CallerUsers, CallerBefore.Uses); - ModelRunner->setFeature(FeatureIndex::CallerConditionallyExecutedBlocks, - CallerBefore.BlocksReachedFromConditionalInstruction); - ModelRunner->setFeature(FeatureIndex::CallerBasicBlockCount, - CallerBefore.BasicBlockCount); - ModelRunner->setFeature(FeatureIndex::CalleeConditionallyExecutedBlocks, - CalleeBefore.BlocksReachedFromConditionalInstruction); - ModelRunner->setFeature(FeatureIndex::CalleeUsers, CalleeBefore.Uses); - ModelRunner->setFeature(FeatureIndex::CostEstimate, CostEstimate); + *ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeBasicBlockCount) = + CalleeBefore.BasicBlockCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallSiteHeight) = + FunctionLevels[&Caller]; + *ModelRunner->getTensor<int64_t>(FeatureIndex::NodeCount) = NodeCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::NrCtantParams) = NrCtantParams; + *ModelRunner->getTensor<int64_t>(FeatureIndex::EdgeCount) = EdgeCount; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallerUsers) = + CallerBefore.Uses; + *ModelRunner->getTensor<int64_t>( + FeatureIndex::CallerConditionallyExecutedBlocks) = + CallerBefore.BlocksReachedFromConditionalInstruction; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CallerBasicBlockCount) = + CallerBefore.BasicBlockCount; + *ModelRunner->getTensor<int64_t>( + FeatureIndex::CalleeConditionallyExecutedBlocks) = + CalleeBefore.BlocksReachedFromConditionalInstruction; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeUsers) = + CalleeBefore.Uses; + *ModelRunner->getTensor<int64_t>(FeatureIndex::CostEstimate) = CostEstimate; // Add the cost features for (size_t I = 0; I < static_cast<size_t>(InlineCostFeatureIndex::NumberOfFeatures); ++I) { - ModelRunner->setFeature( - inlineCostFeatureToMlFeature(static_cast<InlineCostFeatureIndex>(I)), - CostFeatures->at(I)); + 
*ModelRunner->getTensor<int64_t>(inlineCostFeatureToMlFeature( + static_cast<InlineCostFeatureIndex>(I))) = CostFeatures->at(I); } return getAdviceFromModel(CB, ORE); @@ -276,7 +294,8 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) { std::unique_ptr<MLInlineAdvice> MLInlineAdvisor::getAdviceFromModel(CallBase &CB, OptimizationRemarkEmitter &ORE) { - return std::make_unique<MLInlineAdvice>(this, CB, ORE, ModelRunner->run()); + return std::make_unique<MLInlineAdvice>( + this, CB, ORE, static_cast<bool>(ModelRunner->evaluate<int64_t>())); } std::unique_ptr<InlineAdvice> MLInlineAdvisor::getMandatoryAdvice(CallBase &CB, @@ -302,7 +321,8 @@ void MLInlineAdvice::reportContextForRemark( using namespace ore; OR << NV("Callee", Callee->getName()); for (size_t I = 0; I < NumberOfFeatures; ++I) - OR << NV(FeatureNameMap[I], getAdvisor()->getModelRunner().getFeature(I)); + OR << NV(FeatureNameMap[I], + *getAdvisor()->getModelRunner().getTensor<int64_t>(I)); OR << NV("ShouldInline", isInliningRecommended()); } diff --git a/llvm/lib/Analysis/MemDerefPrinter.cpp b/llvm/lib/Analysis/MemDerefPrinter.cpp index 1b16e1a9bcb2..30937a2e4931 100644 --- a/llvm/lib/Analysis/MemDerefPrinter.cpp +++ b/llvm/lib/Analysis/MemDerefPrinter.cpp @@ -59,8 +59,8 @@ bool MemDerefPrinter::runOnFunction(Function &F) { Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, LI->getType(), DL)) Deref.push_back(PO); - if (isDereferenceableAndAlignedPointer( - PO, LI->getType(), MaybeAlign(LI->getAlignment()), DL)) + if (isDereferenceableAndAlignedPointer(PO, LI->getType(), + MaybeAlign(LI->getAlign()), DL)) DerefAndAligned.insert(PO); } } @@ -94,8 +94,8 @@ PreservedAnalyses MemDerefPrinterPass::run(Function &F, Value *PO = LI->getPointerOperand(); if (isDereferenceablePointer(PO, LI->getType(), DL)) Deref.push_back(PO); - if (isDereferenceableAndAlignedPointer( - PO, LI->getType(), MaybeAlign(LI->getAlignment()), DL)) + if (isDereferenceableAndAlignedPointer(PO, 
LI->getType(), + MaybeAlign(LI->getAlign()), DL)) DerefAndAligned.insert(PO); } } diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index 4f2b5b34304d..ffdd7a2cfd4b 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -592,9 +592,9 @@ STATISTIC(ObjectVisitorArgument, STATISTIC(ObjectVisitorLoad, "Number of load instructions with unsolved size and offset"); -APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Alignment) { +APInt ObjectSizeOffsetVisitor::align(APInt Size, MaybeAlign Alignment) { if (Options.RoundToAlign && Alignment) - return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align(Alignment))); + return APInt(IntTyBits, alignTo(Size.getZExtValue(), Alignment)); return Size; } @@ -669,7 +669,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType())); if (!I.isArrayAllocation()) - return std::make_pair(align(Size, I.getAlignment()), Zero); + return std::make_pair(align(Size, I.getAlign()), Zero); Value *ArraySize = I.getArraySize(); if (const ConstantInt *C = dyn_cast<ConstantInt>(ArraySize)) { @@ -679,8 +679,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { bool Overflow; Size = Size.umul_ov(NumElems, Overflow); - return Overflow ? unknown() : std::make_pair(align(Size, I.getAlignment()), - Zero); + return Overflow ? 
unknown() + : std::make_pair(align(Size, I.getAlign()), Zero); } return unknown(); } @@ -694,7 +694,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) { } APInt Size(IntTyBits, DL.getTypeAllocSize(MemoryTy)); - return std::make_pair(align(Size, A.getParamAlignment()), Zero); + return std::make_pair(align(Size, A.getParamAlign()), Zero); } SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) { @@ -800,7 +800,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){ return unknown(); APInt Size(IntTyBits, DL.getTypeAllocSize(GV.getValueType())); - return std::make_pair(align(Size, GV.getAlignment()), Zero); + return std::make_pair(align(Size, GV.getAlign()), Zero); } SizeOffsetType ObjectSizeOffsetVisitor::visitIntToPtrInst(IntToPtrInst&) { diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp index 854ba83bd34a..a877b19df866 100644 --- a/llvm/lib/Analysis/MemoryLocation.cpp +++ b/llvm/lib/Analysis/MemoryLocation.cpp @@ -101,13 +101,8 @@ MemoryLocation MemoryLocation::getForSource(const AtomicMemTransferInst *MTI) { } MemoryLocation MemoryLocation::getForSource(const AnyMemTransferInst *MTI) { - auto Size = LocationSize::afterPointer(); - if (ConstantInt *C = dyn_cast<ConstantInt>(MTI->getLength())) - Size = LocationSize::precise(C->getValue().getZExtValue()); - - // memcpy/memmove can have AA tags. For memcpy, they apply - // to both the source and the destination. 
- return MemoryLocation(MTI->getRawSource(), Size, MTI->getAAMetadata()); + assert(MTI->getRawSource() == MTI->getArgOperand(1)); + return getForArgument(MTI, 1, nullptr); } MemoryLocation MemoryLocation::getForDest(const MemIntrinsic *MI) { @@ -119,13 +114,47 @@ MemoryLocation MemoryLocation::getForDest(const AtomicMemIntrinsic *MI) { } MemoryLocation MemoryLocation::getForDest(const AnyMemIntrinsic *MI) { - auto Size = LocationSize::afterPointer(); - if (ConstantInt *C = dyn_cast<ConstantInt>(MI->getLength())) - Size = LocationSize::precise(C->getValue().getZExtValue()); + assert(MI->getRawDest() == MI->getArgOperand(0)); + return getForArgument(MI, 0, nullptr); +} + +Optional<MemoryLocation> +MemoryLocation::getForDest(const CallBase *CB, const TargetLibraryInfo &TLI) { + if (!CB->onlyAccessesArgMemory()) + return None; + + if (CB->hasOperandBundles()) + // TODO: remove implementation restriction + return None; + + Value *UsedV = nullptr; + Optional<unsigned> UsedIdx; + for (unsigned i = 0; i < CB->arg_size(); i++) { + if (!CB->getArgOperand(i)->getType()->isPointerTy()) + continue; + if (CB->onlyReadsMemory(i)) + continue; + if (!UsedV) { + // First potentially writing parameter + UsedV = CB->getArgOperand(i); + UsedIdx = i; + continue; + } + UsedIdx = None; + if (UsedV != CB->getArgOperand(i)) + // Can't describe writing to two distinct locations. + // TODO: This results in an inprecision when two values derived from the + // same object are passed as arguments to the same function. + return None; + } + if (!UsedV) + // We don't currently have a way to represent a "does not write" result + // and thus have to be conservative and return unknown. + return None; - // memcpy/memmove can have AA tags. For memcpy, they apply - // to both the source and the destination. 
- return MemoryLocation(MI->getRawDest(), Size, MI->getAAMetadata()); + if (UsedIdx) + return getForArgument(CB, *UsedIdx, &TLI); + return MemoryLocation::getBeforeOrAfter(UsedV, CB->getAAMetadata()); } MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, @@ -145,6 +174,9 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, case Intrinsic::memcpy: case Intrinsic::memcpy_inline: case Intrinsic::memmove: + case Intrinsic::memcpy_element_unordered_atomic: + case Intrinsic::memmove_element_unordered_atomic: + case Intrinsic::memset_element_unordered_atomic: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memory intrinsic"); if (ConstantInt *LenCI = dyn_cast<ConstantInt>(II->getArgOperand(2))) @@ -204,6 +236,10 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, II->getArgOperand(1)->getType())), AATags); } + + assert( + !isa<AnyMemTransferInst>(II) && + "all memory transfer intrinsics should be handled by the switch above"); } // We can bound the aliasing properties of memset_pattern16 just as we can @@ -213,6 +249,12 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, LibFunc F; if (TLI && TLI->getLibFunc(*Call, F) && TLI->has(F)) { switch (F) { + case LibFunc_strcpy: + case LibFunc_strcat: + case LibFunc_strncat: + assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for str function"); + return MemoryLocation::getAfter(Arg, AATags); + case LibFunc_memset_chk: { assert(ArgIdx == 0 && "Invalid argument index for memset_chk"); LocationSize Size = LocationSize::afterPointer(); @@ -236,10 +278,18 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, return MemoryLocation(Arg, Size, AATags); } case LibFunc_memset_pattern16: + case LibFunc_memset_pattern4: + case LibFunc_memset_pattern8: assert((ArgIdx == 0 || ArgIdx == 1) && "Invalid argument index for memset_pattern16"); - if (ArgIdx == 1) - return MemoryLocation(Arg, LocationSize::precise(16), AATags); + 
if (ArgIdx == 1) { + unsigned Size = 16; + if (F == LibFunc_memset_pattern4) + Size = 4; + else if (F == LibFunc_memset_pattern8) + Size = 8; + return MemoryLocation(Arg, LocationSize::precise(Size), AATags); + } if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(Call->getArgOperand(2))) return MemoryLocation(Arg, LocationSize::precise(LenCI->getZExtValue()), @@ -274,7 +324,6 @@ MemoryLocation MemoryLocation::getForArgument(const CallBase *Call, break; }; } - // FIXME: Handle memset_pattern4 and memset_pattern8 also. return MemoryLocation::getBeforeOrAfter(Call->getArgOperand(ArgIdx), AATags); } diff --git a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp new file mode 100644 index 000000000000..941458f648bc --- /dev/null +++ b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp @@ -0,0 +1,49 @@ +//===- ModelUnderTrainingRunner.cpp - 'development' mode runner -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of a MLModelRunner for 'development' mode, i.e. evaluation +// happens off a model that's provided from the command line and is interpreted. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/config.h" +#if defined(LLVM_HAVE_TF_API) + +#include "llvm/Analysis/ModelUnderTrainingRunner.h" + +using namespace llvm; + +ModelUnderTrainingRunner::ModelUnderTrainingRunner( + LLVMContext &Ctx, const std::string &ModelPath, + const std::vector<TensorSpec> &InputSpecs, + const std::vector<LoggedFeatureSpec> &OutputSpecs) + : MLModelRunner(Ctx), OutputSpecs(OutputSpecs) { + Evaluator = std::make_unique<TFModelEvaluator>( + ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; }, + OutputSpecs.size()); + if (!Evaluator || !Evaluator->isValid()) { + Ctx.emitError("Failed to create inliner saved model evaluator"); + Evaluator.reset(); + return; + } +} + +void *ModelUnderTrainingRunner::evaluateUntyped() { + LastEvaluationResult = Evaluator->evaluate(); + if (!LastEvaluationResult.hasValue()) { + Ctx.emitError("Error evaluating model."); + return nullptr; + } + return LastEvaluationResult->getUntypedTensorValue(0); +} + +void *ModelUnderTrainingRunner::getTensorUntyped(size_t Index) { + return Evaluator->getUntypedInput(Index); +} + +#endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index d80814852e19..2880ca62a7f8 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -234,6 +234,18 @@ static bool isNonVolatileStore(const Instruction *I) { return false; } +// Returns true if the function definition must be unreachable. +// +// Note if this helper function returns true, `F` is guaranteed +// to be unreachable; if it returns false, `F` might still +// be unreachable but not covered by this helper function. +static bool mustBeUnreachableFunction(const Function &F) { + // A function must be unreachable if its entry block ends with an + // 'unreachable'. 
+ assert(!F.isDeclaration()); + return isa<UnreachableInst>(F.getEntryBlock().getTerminator()); +} + static void computeFunctionSummary( ModuleSummaryIndex &Index, const Module &M, const Function &F, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT, @@ -488,7 +500,8 @@ static void computeFunctionSummary( // Don't try to import functions with noinline attribute. F.getAttributes().hasFnAttr(Attribute::NoInline), F.hasFnAttribute(Attribute::AlwaysInline), - F.hasFnAttribute(Attribute::NoUnwind), MayThrow, HasUnknownCall}; + F.hasFnAttribute(Attribute::NoUnwind), MayThrow, HasUnknownCall, + mustBeUnreachableFunction(F)}; std::vector<FunctionSummary::ParamAccess> ParamAccesses; if (auto *SSI = GetSSICallback(F)) ParamAccesses = SSI->getParamAccesses(Index); @@ -737,7 +750,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( F->hasFnAttribute(Attribute::AlwaysInline), F->hasFnAttribute(Attribute::NoUnwind), /* MayThrow */ true, - /* HasUnknownCall */ true}, + /* HasUnknownCall */ true, + /* MustBeUnreachable */ false}, /*EntryCount=*/0, ArrayRef<ValueInfo>{}, ArrayRef<FunctionSummary::EdgeTy>{}, ArrayRef<GlobalValue::GUID>{}, diff --git a/llvm/lib/Analysis/NoInferenceModelRunner.cpp b/llvm/lib/Analysis/NoInferenceModelRunner.cpp new file mode 100644 index 000000000000..02ece6aa3900 --- /dev/null +++ b/llvm/lib/Analysis/NoInferenceModelRunner.cpp @@ -0,0 +1,33 @@ +//===- NoInferenceModelRunner.cpp - noop ML model runner ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A pseudo model runner. We use it to store feature values when collecting +// logs for the default policy, in 'development' mode, but never ask it to +// 'run'. 
+//===----------------------------------------------------------------------===// +#include "llvm/Config/config.h" +#if defined(LLVM_HAVE_TF_API) + +#include "llvm/Analysis/NoInferenceModelRunner.h" +#include "llvm/Analysis/Utils/TFUtils.h" + +using namespace llvm; + +NoInferenceModelRunner::NoInferenceModelRunner( + LLVMContext &Ctx, const std::vector<TensorSpec> &Inputs) + : MLModelRunner(Ctx) { + ValuesBuffer.reserve(Inputs.size()); + for (const auto &TS : Inputs) + ValuesBuffer.push_back(std::make_unique<char[]>(TS.getElementCount() * + TS.getElementByteSize())); +} + +void *NoInferenceModelRunner::getTensorUntyped(size_t Index) { + return ValuesBuffer[Index].get(); +} +#endif // defined(LLVM_HAVE_TF_API) diff --git a/llvm/lib/Analysis/ReleaseModeModelRunner.cpp b/llvm/lib/Analysis/ReleaseModeModelRunner.cpp deleted file mode 100644 index d2bf95388066..000000000000 --- a/llvm/lib/Analysis/ReleaseModeModelRunner.cpp +++ /dev/null @@ -1,90 +0,0 @@ -//===- ReleaseModeModelRunner.cpp - Fast, precompiled model runner -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a model runner wrapping an AOT compiled ML model. -// Only inference is supported. -// -//===----------------------------------------------------------------------===// -#include "llvm/Config/config.h" -#if defined(LLVM_HAVE_TF_AOT) - -#include "llvm/Analysis/InlineModelFeatureMaps.h" -#include "llvm/Analysis/MLInlineAdvisor.h" - -// codegen-ed file -#include "InlinerSizeModel.h" // NOLINT - -#include <memory> -#include <vector> - -using namespace llvm; -namespace { - -const char FeedPrefix[] = "feed_"; -const char FetchPrefix[] = "fetch_"; - -/// MLModelRunner - production mode implementation. 
It uses a AOT-compiled -/// SavedModel for efficient execution. -class ReleaseModeModelRunner final : public MLModelRunner { -public: - ReleaseModeModelRunner(LLVMContext &Ctx); - virtual ~ReleaseModeModelRunner() = default; - - bool run() override; - - void setFeature(FeatureIndex Index, int64_t Value) override; - int64_t getFeature(int Index) const override; - -private: - std::vector<int32_t> FeatureIndices; - int32_t ResultIndex = -1; - std::unique_ptr<llvm::InlinerSizeModel> CompiledModel; -}; -} // namespace - -ReleaseModeModelRunner::ReleaseModeModelRunner(LLVMContext &Ctx) - : MLModelRunner(Ctx), - CompiledModel(std::make_unique<llvm::InlinerSizeModel>()) { - assert(CompiledModel && "The CompiledModel should be valid"); - - FeatureIndices.resize(NumberOfFeatures); - - for (size_t I = 0; I < NumberOfFeatures; ++I) { - const int Index = - CompiledModel->LookupArgIndex(FeedPrefix + FeatureNameMap[I]); - assert(Index >= 0 && "Cannot find Feature in inlining model"); - FeatureIndices[I] = Index; - } - - ResultIndex = - CompiledModel->LookupResultIndex(std::string(FetchPrefix) + DecisionName); - assert(ResultIndex >= 0 && "Cannot find DecisionName in inlining model"); -} - -int64_t ReleaseModeModelRunner::getFeature(int Index) const { - return *static_cast<int64_t *>( - CompiledModel->arg_data(FeatureIndices[Index])); -} - -void ReleaseModeModelRunner::setFeature(FeatureIndex Index, int64_t Value) { - *static_cast<int64_t *>(CompiledModel->arg_data( - FeatureIndices[static_cast<size_t>(Index)])) = Value; -} - -bool ReleaseModeModelRunner::run() { - CompiledModel->Run(); - return static_cast<bool>( - *static_cast<int64_t *>(CompiledModel->result_data(ResultIndex))); -} - -std::unique_ptr<InlineAdvisor> -llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) { - auto AOTRunner = std::make_unique<ReleaseModeModelRunner>(M.getContext()); - return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner)); -} -#endif // defined(LLVM_HAVE_TF_AOT) diff 
--git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7dc7f9904c70..0c3f32295ae1 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8829,11 +8829,10 @@ const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) { for (auto &LS : reverse(ValuesAtScopes[V])) if (LS.first == L) { LS.second = C; + if (!isa<SCEVConstant>(C)) + ValuesAtScopesUsers[C].push_back({L, V}); break; } - - if (!isa<SCEVConstant>(C)) - ValuesAtScopesUsers[C].push_back({L, V}); return C; } @@ -13058,11 +13057,13 @@ void ScalarEvolution::verify() const { Worklist.append(L->begin(), L->end()); } for (auto &KV : ValueExprMap) { +#ifndef NDEBUG // Check for SCEV expressions referencing invalid/deleted loops. if (auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second)) { assert(ValidLoops.contains(AR->getLoop()) && "AddRec references invalid loop"); } +#endif // Check that the value is also part of the reverse map. auto It = ExprValueMap.find(KV.second); @@ -13122,7 +13123,7 @@ void ScalarEvolution::verify() const { is_contained(It->second, std::make_pair(L, Value))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " - << ValueAtScope << " missing in ValuesAtScopesUsers\n"; + << *ValueAtScope << " missing in ValuesAtScopesUsers\n"; std::abort(); } } @@ -13139,7 +13140,7 @@ void ScalarEvolution::verify() const { is_contained(It->second, std::make_pair(L, ValueAtScope))) continue; dbgs() << "Value: " << *Value << ", Loop: " << *L << ", ValueAtScope: " - << ValueAtScope << " missing in ValuesAtScopes\n"; + << *ValueAtScope << " missing in ValuesAtScopes\n"; std::abort(); } } @@ -13958,11 +13959,12 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { ExprsToRewrite.push_back(LHS); } }; - // Starting at the loop predecessor, climb up the predecessor chain, as long - // as there are predecessors that can be found that have unique successors - // leading to 
the original header. + // First, collect conditions from dominating branches. Starting at the loop + // predecessor, climb up the predecessor chain, as long as there are + // predecessors that can be found that have unique successors leading to the + // original header. // TODO: share this logic with isLoopEntryGuardedByCond. - DenseMap<const SCEV *, const SCEV *> RewriteMap; + SmallVector<std::pair<Value *, bool>> Terms; for (std::pair<const BasicBlock *, const BasicBlock *> Pair( L->getLoopPredecessor(), L->getHeader()); Pair.first; Pair = getPredecessorWithUniqueSuccessorForBB(Pair.first)) { @@ -13972,10 +13974,20 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { if (!LoopEntryPredicate || LoopEntryPredicate->isUnconditional()) continue; - bool EnterIfTrue = LoopEntryPredicate->getSuccessor(0) == Pair.second; + Terms.emplace_back(LoopEntryPredicate->getCondition(), + LoopEntryPredicate->getSuccessor(0) == Pair.second); + } + + // Now apply the information from the collected conditions to RewriteMap. + // Conditions are processed in reverse order, so the earliest conditions is + // processed first. This ensures the SCEVs with the shortest dependency chains + // are constructed first. + DenseMap<const SCEV *, const SCEV *> RewriteMap; + for (auto &E : reverse(Terms)) { + bool EnterIfTrue = E.second; SmallVector<Value *, 8> Worklist; SmallPtrSet<Value *, 8> Visited; - Worklist.push_back(LoopEntryPredicate->getCondition()); + Worklist.push_back(E.first); while (!Worklist.empty()) { Value *Cond = Worklist.pop_back_val(); if (!Visited.insert(Cond).second) diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 72fbd5ad3f68..02923c2c7eb1 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -238,9 +238,8 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, // e.g., x86_64-pc-windows-msvc18. 
bool hasPartialC99 = true; if (T.isKnownWindowsMSVCEnvironment()) { - unsigned Major, Minor, Micro; - T.getEnvironmentVersion(Major, Minor, Micro); - hasPartialC99 = (Major == 0 || Major >= 19); + VersionTuple Version = T.getEnvironmentVersion(); + hasPartialC99 = (Version.getMajor() == 0 || Version.getMajor() >= 19); } // Latest targets support C89 math functions, in part. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 5067f493f02d..6aa9a77391dc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -982,10 +982,10 @@ bool TargetTransformInfo::areInlineCompatible(const Function *Caller, return TTIImpl->areInlineCompatible(Caller, Callee); } -bool TargetTransformInfo::areFunctionArgsABICompatible( +bool TargetTransformInfo::areTypesABICompatible( const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { - return TTIImpl->areFunctionArgsABICompatible(Caller, Callee, Args); + const ArrayRef<Type *> &Types) const { + return TTIImpl->areTypesABICompatible(Caller, Callee, Types); } bool TargetTransformInfo::isIndexedLoadLegal(MemIndexedMode Mode, @@ -1072,8 +1072,13 @@ bool TargetTransformInfo::supportsScalableVectors() const { return TTIImpl->supportsScalableVectors(); } -bool TargetTransformInfo::hasActiveVectorLength() const { - return TTIImpl->hasActiveVectorLength(); +bool TargetTransformInfo::enableScalableVectorization() const { + return TTIImpl->enableScalableVectorization(); +} + +bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const { + return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } InstructionCost diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 1c41c77a8cfb..fc378f97de0b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1154,7 +1154,7 @@ static void 
computeKnownBitsFromOperator(const Operator *I, // If the negate has an NSW flag we can assume the sign bit of the result // will be 0 because that makes abs(INT_MIN) undefined. if (match(RHS, m_Neg(m_Specific(LHS))) && - Q.IIQ.hasNoSignedWrap(cast<Instruction>(RHS))) + Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(RHS))) Known.Zero.setSignBit(); } @@ -1709,23 +1709,25 @@ static void computeKnownBitsFromOperator(const Operator *I, !II->getFunction()->hasFnAttribute(Attribute::VScaleRange)) break; - auto VScaleRange = II->getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs(); + auto Attr = II->getFunction()->getFnAttribute(Attribute::VScaleRange); + Optional<unsigned> VScaleMax = Attr.getVScaleRangeMax(); - if (VScaleRange.second == 0) + if (!VScaleMax) break; + unsigned VScaleMin = Attr.getVScaleRangeMin(); + // If vscale min = max then we know the exact value at compile time // and hence we know the exact bits. - if (VScaleRange.first == VScaleRange.second) { - Known.One = VScaleRange.first; - Known.Zero = VScaleRange.first; + if (VScaleMin == VScaleMax) { + Known.One = VScaleMin; + Known.Zero = VScaleMin; Known.Zero.flipAllBits(); break; } - unsigned FirstZeroHighBit = 32 - countLeadingZeros(VScaleRange.second); + unsigned FirstZeroHighBit = + 32 - countLeadingZeros(VScaleMax.getValue()); if (FirstZeroHighBit < BitWidth) Known.Zero.setBitsFrom(FirstZeroHighBit); @@ -4676,8 +4678,8 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V, return false; const DataLayout &DL = LI->getModule()->getDataLayout(); return isDereferenceableAndAlignedPointer( - LI->getPointerOperand(), LI->getType(), MaybeAlign(LI->getAlignment()), - DL, CtxI, DT, TLI); + LI->getPointerOperand(), LI->getType(), MaybeAlign(LI->getAlign()), DL, + CtxI, DT, TLI); } case Instruction::Call: { auto *CI = cast<const CallInst>(Inst); @@ -4975,14 +4977,6 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, if (ConsiderFlags && 
Op->hasPoisonGeneratingFlags()) return true; - // TODO: this should really be under the ConsiderFlags block, but currently - // these are not dropped by dropPoisonGeneratingFlags - if (const auto *FP = dyn_cast<FPMathOperator>(Op)) { - auto FMF = FP->getFastMathFlags(); - if (FMF.noNaNs() || FMF.noInfs()) - return true; - } - unsigned Opcode = Op->getOpcode(); // Check whether opcode is a poison/undef-generating operation diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 41fb0b9008be..e3bf41c9721b 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -733,6 +733,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(x); KEYWORD(blockaddress); KEYWORD(dso_local_equivalent); + KEYWORD(no_cfi); // Metadata types. KEYWORD(distinct); @@ -773,6 +774,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(noUnwind); KEYWORD(mayThrow); KEYWORD(hasUnknownCall); + KEYWORD(mustBeUnreachable); KEYWORD(calls); KEYWORD(callee); KEYWORD(params); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 5feabd876e3a..35c615522fe2 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -152,28 +152,28 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { FnAttrs.removeAttribute(Attribute::Alignment); } - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); Fn->setAttributes(AS); } else if (CallInst *CI = dyn_cast<CallInst>(V)) { AttributeList AS = CI->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); CI->setAttributes(AS); } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) { AttributeList AS = II->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = 
AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); II->setAttributes(AS); } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(V)) { AttributeList AS = CBI->getAttributes(); AttrBuilder FnAttrs(AS.getFnAttrs()); AS = AS.removeFnAttributes(Context); FnAttrs.merge(B); - AS = AS.addFnAttributes(Context, AttributeSet::get(Context, FnAttrs)); + AS = AS.addFnAttributes(Context, FnAttrs); CBI->setAttributes(AS); } else if (auto *GV = dyn_cast<GlobalVariable>(V)) { AttrBuilder Attrs(GV->getAttributes()); @@ -1306,7 +1306,8 @@ bool LLParser::parseEnumAttribute(Attribute::AttrKind Attr, AttrBuilder &B, unsigned MinValue, MaxValue; if (parseVScaleRangeArguments(MinValue, MaxValue)) return true; - B.addVScaleRangeAttr(MinValue, MaxValue); + B.addVScaleRangeAttr(MinValue, + MaxValue > 0 ? MaxValue : Optional<unsigned>()); return false; } case Attribute::Dereferenceable: { @@ -3287,6 +3288,20 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) { return false; } + case lltok::kw_no_cfi: { + // ValID ::= 'no_cfi' @foo + Lex.Lex(); + + if (parseValID(ID, PFS)) + return true; + + if (ID.Kind != ValID::t_GlobalID && ID.Kind != ValID::t_GlobalName) + return error(ID.Loc, "expected global value name in no_cfi"); + + ID.NoCFI = true; + return false; + } + case lltok::kw_trunc: case lltok::kw_zext: case lltok::kw_sext: @@ -5267,9 +5282,13 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V, } case ValID::t_GlobalName: V = getGlobalVal(ID.StrVal, Ty, ID.Loc); + if (V && ID.NoCFI) + V = NoCFIValue::get(cast<GlobalValue>(V)); return V == nullptr; case ValID::t_GlobalID: V = getGlobalVal(ID.UIntVal, Ty, ID.Loc); + if (V && ID.NoCFI) + V = NoCFIValue::get(cast<GlobalValue>(V)); return V == nullptr; case ValID::t_APSInt: if (!Ty->isIntegerTy()) @@ -8533,6 +8552,7 @@ bool LLParser::parseFlag(unsigned &Val) { /// [',' 'noUnwind' ':' Flag]? ')' /// [',' 'mayThrow' ':' Flag]? 
')' /// [',' 'hasUnknownCall' ':' Flag]? ')' +/// [',' 'mustBeUnreachable' ':' Flag]? ')' bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) { assert(Lex.getKind() == lltok::kw_funcFlags); @@ -8599,6 +8619,12 @@ bool LLParser::parseOptionalFFlags(FunctionSummary::FFlags &FFlags) { return true; FFlags.HasUnknownCall = Val; break; + case lltok::kw_mustBeUnreachable: + Lex.Lex(); + if (parseToken(lltok::colon, "expected ':'") || parseFlag(Val)) + return true; + FFlags.MustBeUnreachable = Val; + break; default: return error(Lex.getLoc(), "expected function flag type"); } diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp index cd1d872cc219..284e469a1d2f 100644 --- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp +++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp @@ -57,11 +57,7 @@ bool MetadataVerifier::verifyArray( auto &Array = Node.getArray(); if (Size && Array.size() != *Size) return false; - for (auto &Item : Array) - if (!verifyNode(Item)) - return false; - - return true; + return llvm::all_of(Array, verifyNode); } bool MetadataVerifier::verifyEntry( diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index d7bcb0d7f575..a36b256c29b6 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -107,9 +107,9 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID, // Check to see if we have a blockinfo record for this record, with a name. 
if (const BitstreamBlockInfo::BlockInfo *Info = BlockInfo.getBlockInfo(BlockID)) { - for (unsigned i = 0, e = Info->RecordNames.size(); i != e; ++i) - if (Info->RecordNames[i].first == CodeID) - return Info->RecordNames[i].second.c_str(); + for (const std::pair<unsigned, std::string> &RN : Info->RecordNames) + if (RN.first == CodeID) + return RN.second.c_str(); } if (CurStreamType != LLVMIRBitstream) @@ -219,6 +219,7 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(CST_CODE, CE_SHUFVEC_EX) STRINGIFY_CODE(CST_CODE, CE_UNOP) STRINGIFY_CODE(CST_CODE, DSO_LOCAL_EQUIVALENT) + STRINGIFY_CODE(CST_CODE, NO_CFI_VALUE) case bitc::CST_CODE_BLOCKADDRESS: return "CST_CODE_BLOCKADDRESS"; STRINGIFY_CODE(CST_CODE, DATA) @@ -646,16 +647,14 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, // Emit per-block stats. O.OS << "Per-block Summary:\n"; - for (std::map<unsigned, PerBlockIDStats>::iterator I = BlockIDStats.begin(), - E = BlockIDStats.end(); - I != E; ++I) { - O.OS << " Block ID #" << I->first; + for (const auto &Stat : BlockIDStats) { + O.OS << " Block ID #" << Stat.first; if (Optional<const char *> BlockName = - GetBlockName(I->first, BlockInfo, CurStreamType)) + GetBlockName(Stat.first, BlockInfo, CurStreamType)) O.OS << " (" << *BlockName << ")"; O.OS << ":\n"; - const PerBlockIDStats &Stats = I->second; + const PerBlockIDStats &Stats = Stat.second; O.OS << " Num Instances: " << Stats.NumInstances << "\n"; O.OS << " Total Size: "; printSize(O.OS, Stats.NumBits); @@ -694,8 +693,8 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, O.OS << "\tRecord Histogram:\n"; O.OS << "\t\t Count # Bits b/Rec % Abv Record Kind\n"; - for (unsigned i = 0, e = FreqPairs.size(); i != e; ++i) { - const PerRecordStats &RecStats = Stats.CodeFreq[FreqPairs[i].second]; + for (const auto &FreqPair : FreqPairs) { + const PerRecordStats &RecStats = Stats.CodeFreq[FreqPair.second]; O.OS << format("\t\t%7d %9lu", RecStats.NumInstances, (unsigned 
long)RecStats.TotalBits); @@ -714,10 +713,10 @@ void BitcodeAnalyzer::printStats(BCDumpOptions O, O.OS << " "; if (Optional<const char *> CodeName = GetCodeName( - FreqPairs[i].second, I->first, BlockInfo, CurStreamType)) + FreqPair.second, Stat.first, BlockInfo, CurStreamType)) O.OS << *CodeName << "\n"; else - O.OS << "UnknownCode" << FreqPairs[i].second << "\n"; + O.OS << "UnknownCode" << FreqPair.second << "\n"; } O.OS << "\n"; } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 993cb1de8c02..f5a878f8788a 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -488,6 +488,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer { BitcodeReaderValueList ValueList; Optional<MetadataLoader> MDLoader; std::vector<Comdat *> ComdatList; + DenseSet<GlobalObject *> ImplicitComdatObjects; SmallVector<Instruction *, 64> InstructionList; std::vector<std::pair<GlobalVariable *, unsigned>> GlobalInits; @@ -932,6 +933,7 @@ static FunctionSummary::FFlags getDecodedFFlags(uint64_t RawFlags) { Flags.NoUnwind = (RawFlags >> 6) & 0x1; Flags.MayThrow = (RawFlags >> 7) & 0x1; Flags.HasUnknownCall = (RawFlags >> 8) & 0x1; + Flags.MustBeUnreachable = (RawFlags >> 9) & 0x1; return Flags; } @@ -2037,14 +2039,8 @@ Expected<Value *> BitcodeReader::recordValue(SmallVectorImpl<uint64_t> &Record, return error("Invalid value name"); V->setName(NameStr); auto *GO = dyn_cast<GlobalObject>(V); - if (GO) { - if (GO->getComdat() == reinterpret_cast<Comdat *>(1)) { - if (TT.supportsCOMDAT()) - GO->setComdat(TheModule->getOrInsertComdat(V->getName())); - else - GO->setComdat(nullptr); - } - } + if (GO && ImplicitComdatObjects.contains(GO) && TT.supportsCOMDAT()) + GO->setComdat(TheModule->getOrInsertComdat(V->getName())); return V; } @@ -2942,6 +2938,19 @@ Error BitcodeReader::parseConstants() { V = DSOLocalEquivalent::get(GV); break; } + case bitc::CST_CODE_NO_CFI_VALUE: { + 
if (Record.size() < 2) + return error("Invalid record"); + Type *GVTy = getTypeByID(Record[0]); + if (!GVTy) + return error("Invalid record"); + GlobalValue *GV = dyn_cast_or_null<GlobalValue>( + ValueList.getConstantFwdRef(Record[1], GVTy)); + if (!GV) + return error("Invalid record"); + V = NoCFIValue::get(GV); + break; + } } ValueList.assignValue(V, NextCstNo); @@ -3292,7 +3301,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) { NewGV->setComdat(ComdatList[ComdatID - 1]); } } else if (hasImplicitComdat(RawLinkage)) { - NewGV->setComdat(reinterpret_cast<Comdat *>(1)); + ImplicitComdatObjects.insert(NewGV); } if (Record.size() > 12) { @@ -3426,7 +3435,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) { Func->setComdat(ComdatList[ComdatID - 1]); } } else if (hasImplicitComdat(RawLinkage)) { - Func->setComdat(reinterpret_cast<Comdat *>(1)); + ImplicitComdatObjects.insert(Func); } if (Record.size() > 13) @@ -6733,10 +6742,10 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) { // not have its own string table. A bitcode file may have multiple // string tables if it was created by binary concatenation, for example // with "llvm-cat -b". 
- for (auto I = F.Mods.rbegin(), E = F.Mods.rend(); I != E; ++I) { - if (!I->Strtab.empty()) + for (BitcodeModule &I : llvm::reverse(F.Mods)) { + if (!I.Strtab.empty()) break; - I->Strtab = *Strtab; + I.Strtab = *Strtab; } // Similarly, the string table is used by every preceding symbol table; // normally there will be just one unless the bitcode file was created diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index e2354c40844a..dc06bc10cf95 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -833,8 +833,7 @@ void ModuleBitcodeWriter::writeAttributeTable() { Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3); SmallVector<uint64_t, 64> Record; - for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - AttributeList AL = Attrs[i]; + for (const AttributeList &AL : Attrs) { for (unsigned i : AL.indexes()) { AttributeSet AS = AL.getAttributes(i); if (AS.hasAttributes()) @@ -1067,6 +1066,7 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) { RawFlags |= (Flags.NoUnwind << 6); RawFlags |= (Flags.MayThrow << 7); RawFlags |= (Flags.HasUnknownCall << 8); + RawFlags |= (Flags.MustBeUnreachable << 9); return RawFlags; } @@ -2657,6 +2657,10 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal, Code = bitc::CST_CODE_DSO_LOCAL_EQUIVALENT; Record.push_back(VE.getTypeID(Equiv->getGlobalValue()->getType())); Record.push_back(VE.getValueID(Equiv->getGlobalValue())); + } else if (const auto *NC = dyn_cast<NoCFIValue>(C)) { + Code = bitc::CST_CODE_NO_CFI_VALUE; + Record.push_back(VE.getTypeID(NC->getGlobalValue()->getType())); + Record.push_back(VE.getValueID(NC->getGlobalValue())); } else { #ifndef NDEBUG C->dump(); diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp index 07e0708e68c3..df4f1a1873d7 100644 --- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp +++ 
b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -310,8 +310,7 @@ static UseListOrderStack predictUseListOrder(const Module &M) { // We want to visit the functions backward now so we can list function-local // constants in the last Function they're used in. Module-level constants // have already been visited above. - for (auto I = M.rbegin(), E = M.rend(); I != E; ++I) { - const Function &F = *I; + for (const Function &F : llvm::reverse(M)) { if (F.isDeclaration()) continue; for (const BasicBlock &BB : F) @@ -541,9 +540,8 @@ void ValueEnumerator::print(raw_ostream &OS, const ValueMapType &Map, const char *Name) const { OS << "Map Name: " << Name << "\n"; OS << "Size: " << Map.size() << "\n"; - for (ValueMapType::const_iterator I = Map.begin(), - E = Map.end(); I != E; ++I) { - const Value *V = I->first; + for (const auto &I : Map) { + const Value *V = I.first; if (V->hasName()) OS << "Value: " << V->getName(); else @@ -569,10 +567,10 @@ void ValueEnumerator::print(raw_ostream &OS, const MetadataMapType &Map, const char *Name) const { OS << "Map Name: " << Name << "\n"; OS << "Size: " << Map.size() << "\n"; - for (auto I = Map.begin(), E = Map.end(); I != E; ++I) { - const Metadata *MD = I->first; - OS << "Metadata: slot = " << I->second.ID << "\n"; - OS << "Metadata: function = " << I->second.F << "\n"; + for (const auto &I : Map) { + const Metadata *MD = I.first; + OS << "Metadata: slot = " << I.second.ID << "\n"; + OS << "Metadata: function = " << I.second.F << "\n"; MD->print(OS); OS << "\n"; } diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp index 5984063627b0..5c64622c7245 100644 --- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp @@ -561,8 +561,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( << ":\n"); std::map<unsigned, BitVector> RenameRegisterMap; unsigned SuperReg = 0; - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - 
unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg)) SuperReg = Reg; @@ -584,8 +583,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( } // All group registers should be a subreg of SuperReg. - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { if (Reg == SuperReg) continue; bool IsSub = TRI->isSubRegister(SuperReg, Reg); // FIXME: remove this once PR18663 has been properly fixed. For now, @@ -646,8 +644,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters( // For each referenced group register (which must be a SuperReg or // a subregister of SuperReg), find the corresponding subregister // of NewSuperReg and make sure it is free to be renamed. - for (unsigned i = 0, e = Regs.size(); i != e; ++i) { - unsigned Reg = Regs[i]; + for (unsigned Reg : Regs) { unsigned NewReg = 0; if (Reg == SuperReg) { NewReg = NewSuperReg; diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp index 7d8a73e12d3a..7e68e5e22879 100644 --- a/llvm/lib/CodeGen/Analysis.cpp +++ b/llvm/lib/CodeGen/Analysis.cpp @@ -712,8 +712,8 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F, // The manipulations performed when we're looking through an insertvalue or // an extractvalue would happen at the front of the RetPath list, so since // we have to copy it anyway it's more efficient to create a reversed copy. - SmallVector<unsigned, 4> TmpRetPath(RetPath.rbegin(), RetPath.rend()); - SmallVector<unsigned, 4> TmpCallPath(CallPath.rbegin(), CallPath.rend()); + SmallVector<unsigned, 4> TmpRetPath(llvm::reverse(RetPath)); + SmallVector<unsigned, 4> TmpCallPath(llvm::reverse(CallPath)); // Finally, we can check whether the value produced by the tail call at this // index is compatible with the value we return. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 828cb760b82e..533f20535655 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -180,7 +180,7 @@ Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL, Alignment = InAlign; // If the GV has a specified alignment, take it into account. - const MaybeAlign GVAlign(GV->getAlignment()); + const MaybeAlign GVAlign(GV->getAlign()); if (!GVAlign) return Alignment; @@ -288,7 +288,11 @@ bool AsmPrinter::doInitialization(Module &M) { // use the directive, where it would need the same conditionalization // anyway. const Triple &Target = TM.getTargetTriple(); - OutStreamer->emitVersionForTarget(Target, M.getSDKVersion()); + Triple TVT(M.getDarwinTargetVariantTriple()); + OutStreamer->emitVersionForTarget( + Target, M.getSDKVersion(), + M.getDarwinTargetVariantTriple().empty() ? nullptr : &TVT, + M.getDarwinTargetVariantSDKVersion()); // Allow the target to emit any magic that it wants at the start of the file. 
emitStartOfAsmFile(M); @@ -1856,6 +1860,17 @@ bool AsmPrinter::doFinalization(Module &M) { continue; OutStreamer->emitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference); } + if (shouldEmitWeakSwiftAsyncExtendedFramePointerFlags()) { + auto SymbolName = "swift_async_extendedFramePointerFlags"; + auto Global = M.getGlobalVariable(SymbolName); + if (!Global) { + auto Int8PtrTy = Type::getInt8PtrTy(M.getContext()); + Global = new GlobalVariable(M, Int8PtrTy, false, + GlobalValue::ExternalWeakLinkage, nullptr, + SymbolName); + OutStreamer->emitSymbolAttribute(getSymbol(Global), MCSA_WeakReference); + } + } } // Print aliases in topological order, that is, for each alias a = b, @@ -2502,6 +2517,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) { if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(CV)) return getObjFileLowering().lowerDSOLocalEquivalent(Equiv, TM); + if (const NoCFIValue *NC = dyn_cast<NoCFIValue>(CV)) + return MCSymbolRefExpr::create(getSymbol(NC->getGlobalValue()), Ctx); + const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV); if (!CE) { llvm_unreachable("Unknown constant value to lower!"); diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 85ff84484ced..d621108408f0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -611,8 +611,8 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) { void CodeViewDebug::beginModule(Module *M) { // If module doesn't have named metadata anchors or COFF debug section // is not available, skip any debug info related stuff. 
- if (!M->getNamedMetadata("llvm.dbg.cu") || - !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { + NamedMDNode *CUs = M->getNamedMetadata("llvm.dbg.cu"); + if (!CUs || !Asm->getObjFileLowering().getCOFFDebugSymbolsSection()) { Asm = nullptr; return; } @@ -622,7 +622,6 @@ void CodeViewDebug::beginModule(Module *M) { TheCPU = mapArchToCVCPUType(Triple(M->getTargetTriple()).getArch()); // Get the current source language. - NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu"); const MDNode *Node = *CUs->operands().begin(); const auto *CU = cast<DICompileUnit>(Node); @@ -650,6 +649,7 @@ void CodeViewDebug::endModule() { switchToDebugSectionForSymbol(nullptr); MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols); + emitObjName(); emitCompilerInformation(); endCVSubsection(CompilerInfo); @@ -785,6 +785,29 @@ void CodeViewDebug::emitTypeGlobalHashes() { } } +void CodeViewDebug::emitObjName() { + MCSymbol *CompilerEnd = beginSymbolRecord(SymbolKind::S_OBJNAME); + + StringRef PathRef(Asm->TM.Options.ObjectFilenameForDebug); + llvm::SmallString<256> PathStore(PathRef); + + if (PathRef.empty() || PathRef == "-") { + // Don't emit the filename if we're writing to stdout or to /dev/null. 
+ PathRef = {}; + } else { + llvm::sys::path::remove_dots(PathStore, /*remove_dot_dot=*/true); + PathRef = PathStore; + } + + OS.AddComment("Signature"); + OS.emitIntValue(0, 4); + + OS.AddComment("Object name"); + emitNullTerminatedSymbolName(OS, PathRef); + + endSymbolRecord(CompilerEnd); +} + namespace { struct Version { int Part[4]; diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h index 6f88e15ee8fe..d1fc3cdccb20 100644 --- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h @@ -302,6 +302,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase { void emitTypeGlobalHashes(); + void emitObjName(); + void emitCompilerInformation(); void emitBuildInfo(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 0d2736178f0f..9b73f0ab2f05 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -779,7 +779,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo(); auto AddEntry = [&](const DbgValueLocEntry &Entry, - DIExpressionCursor &Cursor) { + DIExpressionCursor &Cursor) { if (Entry.isLocation()) { if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Entry.getLoc().getReg())) @@ -788,11 +788,19 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, // If there is an expression, emit raw unsigned bytes. DwarfExpr.addUnsignedConstant(Entry.getInt()); } else if (Entry.isConstantFP()) { + // DwarfExpression does not support arguments wider than 64 bits + // (see PR52584). + // TODO: Consider chunking expressions containing overly wide + // arguments into separate pointer-sized fragment expressions. 
APInt RawBytes = Entry.getConstantFP()->getValueAPF().bitcastToAPInt(); - DwarfExpr.addUnsignedConstant(RawBytes); + if (RawBytes.getBitWidth() > 64) + return false; + DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue()); } else if (Entry.isConstantInt()) { APInt RawBytes = Entry.getConstantInt()->getValue(); - DwarfExpr.addUnsignedConstant(RawBytes); + if (RawBytes.getBitWidth() > 64) + return false; + DwarfExpr.addUnsignedConstant(RawBytes.getZExtValue()); } else if (Entry.isTargetIndexLocation()) { TargetIndexLocation Loc = Entry.getTargetIndexLocation(); // TODO TargetIndexLocation is a target-independent. Currently only the @@ -805,11 +813,12 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV, return true; }; - DwarfExpr.addExpression( - std::move(Cursor), - [&](unsigned Idx, DIExpressionCursor &Cursor) -> bool { - return AddEntry(DVal->getLocEntries()[Idx], Cursor); - }); + if (!DwarfExpr.addExpression( + std::move(Cursor), + [&](unsigned Idx, DIExpressionCursor &Cursor) -> bool { + return AddEntry(DVal->getLocEntries()[Idx], Cursor); + })) + return VariableDie; // Now attach the location information to the DIE. addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 047676d4c11e..48134f1fd774 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1224,17 +1224,15 @@ void DwarfDebug::beginModule(Module *M) { CU.getOrCreateGlobalVariableDIE(GV, sortGlobalExprs(GVMap[GV])); } - for (auto *Ty : CUNode->getEnumTypes()) { - // The enum types array by design contains pointers to - // MDNodes rather than DIRefs. Unique them here. + for (auto *Ty : CUNode->getEnumTypes()) CU.getOrCreateTypeDIE(cast<DIType>(Ty)); - } + for (auto *Ty : CUNode->getRetainedTypes()) { // The retained types array by design contains pointers to // MDNodes rather than DIRefs. Unique them here. 
if (DIType *RT = dyn_cast<DIType>(Ty)) - // There is no point in force-emitting a forward declaration. - CU.getOrCreateTypeDIE(RT); + // There is no point in force-emitting a forward declaration. + CU.getOrCreateTypeDIE(RT); } // Emit imported_modules last so that the relevant context is already // available. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 6409c39e7849..37407c98e75f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -463,15 +463,14 @@ static bool isMemoryLocation(DIExpressionCursor ExprCursor) { return true; } -void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor, - unsigned FragmentOffsetInBits) { +void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor) { addExpression(std::move(ExprCursor), [](unsigned Idx, DIExpressionCursor &Cursor) -> bool { llvm_unreachable("unhandled opcode found in expression"); }); } -void DwarfExpression::addExpression( +bool DwarfExpression::addExpression( DIExpressionCursor &&ExprCursor, llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg) { // Entry values can currently only cover the initial register location, @@ -496,7 +495,7 @@ void DwarfExpression::addExpression( case dwarf::DW_OP_LLVM_arg: if (!InsertArg(Op->getArg(0), ExprCursor)) { LocationKind = Unknown; - return; + return false; } break; case dwarf::DW_OP_LLVM_fragment: { @@ -527,7 +526,7 @@ void DwarfExpression::addExpression( setSubRegisterPiece(0, 0); // Reset the location description kind. LocationKind = Unknown; - return; + return true; } case dwarf::DW_OP_plus_uconst: assert(!isRegisterLocation()); @@ -630,6 +629,8 @@ void DwarfExpression::addExpression( if (isImplicitLocation() && !isParameterValue()) // Turn this into an implicit location description. addStackValue(); + + return true; } /// add masking operations to stencil out a subregister. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h index 513e9072309e..e605fe2f7d39 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h @@ -340,16 +340,17 @@ public: /// create one if necessary. unsigned getOrCreateBaseType(unsigned BitSize, dwarf::TypeKind Encoding); + /// Emit all remaining operations in the DIExpressionCursor. The + /// cursor must not contain any DW_OP_LLVM_arg operations. + void addExpression(DIExpressionCursor &&Expr); + /// Emit all remaining operations in the DIExpressionCursor. - /// - /// \param FragmentOffsetInBits If this is one fragment out of multiple - /// locations, this is the offset of the - /// fragment inside the entire variable. - void addExpression(DIExpressionCursor &&Expr, - unsigned FragmentOffsetInBits = 0); - void - addExpression(DIExpressionCursor &&Expr, - llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg); + /// DW_OP_LLVM_arg operations are resolved by calling (\p InsertArg). + // + /// \return false if any call to (\p InsertArg) returns false. + bool addExpression( + DIExpressionCursor &&Expr, + llvm::function_ref<bool(unsigned, DIExpressionCursor &)> InsertArg); /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to /// the fragment described by \c Expr. 
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 976e35905144..6b6d63f14f87 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -536,6 +536,18 @@ void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) { } } +void DwarfUnit::addAccess(DIE &Die, DINode::DIFlags Flags) { + if ((Flags & DINode::FlagAccessibility) == DINode::FlagProtected) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_protected); + else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPrivate) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_private); + else if ((Flags & DINode::FlagAccessibility) == DINode::FlagPublic) + addUInt(Die, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, + dwarf::DW_ACCESS_public); +} + DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) { if (!Context || isa<DIFile>(Context)) return &getUnitDie(); @@ -842,13 +854,17 @@ void DwarfUnit::addAnnotation(DIE &Buffer, DINodeArray Annotations) { for (const Metadata *Annotation : Annotations->operands()) { const MDNode *MD = cast<MDNode>(Annotation); const MDString *Name = cast<MDString>(MD->getOperand(0)); - - // Currently, only MDString is supported with btf_decl_tag attribute. 
- const MDString *Value = cast<MDString>(MD->getOperand(1)); + const auto &Value = MD->getOperand(1); DIE &AnnotationDie = createAndAddDIE(dwarf::DW_TAG_LLVM_annotation, Buffer); addString(AnnotationDie, dwarf::DW_AT_name, Name->getString()); - addString(AnnotationDie, dwarf::DW_AT_const_value, Value->getString()); + if (const auto *Data = dyn_cast<MDString>(Value)) + addString(AnnotationDie, dwarf::DW_AT_const_value, Data->getString()); + else if (const auto *Data = dyn_cast<ConstantAsMetadata>(Value)) + addConstantValue(AnnotationDie, Data->getValue()->getUniqueInteger(), + /*Unsigned=*/true); + else + assert(false && "Unsupported annotation value type"); } } @@ -1007,6 +1023,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (CTy->isForwardDecl()) addFlag(Buffer, dwarf::DW_AT_declaration); + // Add accessibility info if available. + addAccess(Buffer, CTy->getFlags()); + // Add source line info if available. if (!CTy->isForwardDecl()) addSourceLine(Buffer, CTy); @@ -1308,15 +1327,7 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie, if (SP->isNoReturn()) addFlag(SPDie, dwarf::DW_AT_noreturn); - if (SP->isProtected()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (SP->isPrivate()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - else if (SP->isPublic()) - addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(SPDie, SP->getFlags()); if (SP->isExplicit()) addFlag(SPDie, dwarf::DW_AT_explicit); @@ -1666,16 +1677,8 @@ DIE &DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) { } } - if (DT->isProtected()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (DT->isPrivate()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - 
// Otherwise C++ member and base classes are considered public. - else if (DT->isPublic()) - addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(MemberDie, DT->getFlags()); + if (DT->isVirtual()) addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, dwarf::DW_VIRTUALITY_virtual); @@ -1717,15 +1720,7 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) { // FIXME: We could omit private if the parent is a class_type, and // public if the parent is something else. - if (DT->isProtected()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_protected); - else if (DT->isPrivate()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_private); - else if (DT->isPublic()) - addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1, - dwarf::DW_ACCESS_public); + addAccess(StaticMemberDIE, DT->getFlags()); if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT->getConstant())) addConstantValue(StaticMemberDIE, CI, Ty); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8140279adaef..54b0079dd7ce 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -226,6 +226,9 @@ public: /// Add thrown types. void addThrownTypes(DIE &Die, DINodeArray ThrownTypes); + /// Add the accessibility attribute. + void addAccess(DIE &Die, DINode::DIFlags Flags); + /// Add a new type attribute to the specified entity. 
/// /// This takes and attribute parameter because DW_AT_friend attributes are diff --git a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp index a9fb31d42679..3ade262d9af2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp @@ -112,16 +112,12 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, EmitCamlGlobal(M, AP, "frametable"); int NumDescriptors = 0; - for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), - IE = Info.funcinfo_end(); - I != IE; ++I) { - GCFunctionInfo &FI = **I; - if (FI.getStrategy().getName() != getStrategy().getName()) + for (std::unique_ptr<GCFunctionInfo> &FI : + llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) { + if (FI->getStrategy().getName() != getStrategy().getName()) // this function is managed by some other GC continue; - for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) { - NumDescriptors++; - } + NumDescriptors += FI->size(); } if (NumDescriptors >= 1 << 16) { @@ -131,35 +127,34 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.emitInt16(NumDescriptors); AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8)); - for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(), - IE = Info.funcinfo_end(); - I != IE; ++I) { - GCFunctionInfo &FI = **I; - if (FI.getStrategy().getName() != getStrategy().getName()) + for (std::unique_ptr<GCFunctionInfo> &FI : + llvm::make_range(Info.funcinfo_begin(), Info.funcinfo_end())) { + if (FI->getStrategy().getName() != getStrategy().getName()) // this function is managed by some other GC continue; - uint64_t FrameSize = FI.getFrameSize(); + uint64_t FrameSize = FI->getFrameSize(); if (FrameSize >= 1 << 16) { // Very rude! 
- report_fatal_error("Function '" + FI.getFunction().getName() + + report_fatal_error("Function '" + FI->getFunction().getName() + "' is too large for the ocaml GC! " "Frame size " + Twine(FrameSize) + ">= 65536.\n" "(" + - Twine(reinterpret_cast<uintptr_t>(&FI)) + ")"); + Twine(reinterpret_cast<uintptr_t>(FI.get())) + ")"); } AP.OutStreamer->AddComment("live roots for " + - Twine(FI.getFunction().getName())); + Twine(FI->getFunction().getName())); AP.OutStreamer->AddBlankLine(); - for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) { - size_t LiveCount = FI.live_size(J); + for (GCFunctionInfo::iterator J = FI->begin(), JE = FI->end(); J != JE; + ++J) { + size_t LiveCount = FI->live_size(J); if (LiveCount >= 1 << 16) { // Very rude! - report_fatal_error("Function '" + FI.getFunction().getName() + + report_fatal_error("Function '" + FI->getFunction().getName() + "' is too large for the ocaml GC! " "Live root count " + Twine(LiveCount) + " >= 65536."); @@ -169,8 +164,8 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info, AP.emitInt16(FrameSize); AP.emitInt16(LiveCount); - for (GCFunctionInfo::live_iterator K = FI.live_begin(J), - KE = FI.live_end(J); + for (GCFunctionInfo::live_iterator K = FI->live_begin(J), + KE = FI->live_end(J); K != KE; ++K) { if (K->StackOffset >= 1 << 16) { // Very rude! 
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp index 9e6f1a537de3..bab187f46535 100644 --- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.cpp @@ -47,7 +47,6 @@ void PseudoProbeHandler::emitPseudoProbe(uint64_t Guid, uint64_t Index, InlinedAt = InlinedAt->getInlinedAt(); } - SmallVector<InlineSite, 8> InlineStack(ReversedInlineStack.rbegin(), - ReversedInlineStack.rend()); + SmallVector<InlineSite, 8> InlineStack(llvm::reverse(ReversedInlineStack)); Asm->OutStreamer->emitPseudoProbe(Guid, Index, Type, Attr, InlineStack); } diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 64dadc82b48b..0ff67f7ca00a 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -1125,8 +1125,8 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) { // If this is a large problem, avoid visiting the same basic blocks multiple // times. 
if (MergePotentials.size() == TailMergeThreshold) - for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i) - TriedMerging.insert(MergePotentials[i].getBlock()); + for (MergePotentialsElt &Elt : MergePotentials) + TriedMerging.insert(Elt.getBlock()); if (MergePotentials.size() >= 2) MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength); diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index 863a0e1e0b56..5f9982cd155d 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -15,13 +15,13 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/CodeGen/StackMaps.h" #include <cassert> #include <tuple> @@ -35,7 +35,7 @@ void VirtRegAuxInfo::calculateSpillWeightsAndHints() { MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = Register::index2VirtReg(I); + Register Reg = Register::index2VirtReg(I); if (MRI.reg_nodbg_empty(Reg)) continue; calculateSpillWeightAndHint(LIS.getInterval(Reg)); @@ -64,14 +64,14 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg, if (Register::isVirtualRegister(HReg)) return Sub == HSub ? HReg : Register(); - const TargetRegisterClass *rc = MRI.getRegClass(Reg); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); MCRegister CopiedPReg = HSub ? TRI.getSubReg(HReg, HSub) : HReg.asMCReg(); - if (rc->contains(CopiedPReg)) + if (RC->contains(CopiedPReg)) return CopiedPReg; // Check if reg:sub matches so that a super register could be hinted. 
if (Sub) - return TRI.getMatchingSuperReg(CopiedPReg, Sub, rc); + return TRI.getMatchingSuperReg(CopiedPReg, Sub, RC); return 0; } @@ -80,8 +80,8 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg, static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, const TargetInstrInfo &TII) { - unsigned Reg = LI.reg(); - unsigned Original = VRM.getOriginal(Reg); + Register Reg = LI.reg(); + Register Original = VRM.getOriginal(Reg); for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; @@ -183,8 +183,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, bool ShouldUpdateLI = !IsLocalSplitArtifact; if (IsLocalSplitArtifact) { - MachineBasicBlock *localMBB = LIS.getMBBFromIndex(*End); - assert(localMBB == LIS.getMBBFromIndex(*Start) && + MachineBasicBlock *LocalMBB = LIS.getMBBFromIndex(*End); + assert(LocalMBB == LIS.getMBBFromIndex(*Start) && "start and end are expected to be in the same basic block"); // Local split artifact will have 2 additional copy instructions and they @@ -192,8 +192,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, // localLI = COPY other // ... 
// other = COPY localLI - TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, localMBB); - TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, localMBB); + TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB); + TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB); NumInstr += 2; } diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index bbdd8aab502e..7c236a9785d8 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -68,6 +68,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeMachineCSEPass(Registry); initializeMachineCombinerPass(Registry); initializeMachineCopyPropagationPass(Registry); + initializeMachineCycleInfoPrinterPassPass(Registry); + initializeMachineCycleInfoWrapperPassPass(Registry); initializeMachineDominatorTreePass(Registry); initializeMachineFunctionPrinterPassPass(Registry); initializeMachineLICMPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index ac4180c4c3ab..747f4e4fdecc 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4831,9 +4831,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI); - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) { // Compute the constraint code and ConstraintType to use. 
TLI.ComputeConstraintToUse(OpInfo, SDValue()); @@ -5617,9 +5615,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(*DL, TRI, *CS); unsigned ArgNo = 0; - for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { - TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + for (TargetLowering::AsmOperandInfo &OpInfo : TargetConstraints) { // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); @@ -6856,8 +6852,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // Use reverse iterator because later select may use the value of the // earlier select, and we need to propagate value through earlier select // to get the PHI operand. - for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { - SelectInst *SI = *It; + for (SelectInst *SI : llvm::reverse(ASI)) { // The select itself is replaced with a PHI Node. PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front()); PN->takeName(SI); diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 4e98d49206b5..901409ea9f8f 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -405,8 +405,7 @@ findSuitableFreeRegister(RegRefIter RegRefBegin, const TargetRegisterClass *RC, SmallVectorImpl<unsigned> &Forbid) { ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC); - for (unsigned i = 0; i != Order.size(); ++i) { - unsigned NewReg = Order[i]; + for (unsigned NewReg : Order) { // Don't replace a register with itself. 
if (NewReg == AntiDepReg) continue; // Don't replace a register with one that was recently used to repair diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 0bb186a02416..5579152f1ce0 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -142,9 +142,9 @@ bool DeadMachineInstructionElim::eliminateDeadMI(MachineFunction &MF) { if (isDead(&MI)) { LLVM_DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << MI); // It is possible that some DBG_VALUE instructions refer to this - // instruction. They get marked as undef and will be deleted - // in the live debug variable analysis. - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + // instruction. They will be deleted in the live debug variable + // analysis. + MI.eraseFromParent(); AnyChanges = true; ++NumDeletes; continue; diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 90883212a275..0b5469b02637 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -210,9 +210,9 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { // Check all instructions, except the terminators. It is assumed that // terminators never have side effects or define any used register values. - for (MachineBasicBlock::iterator I = MBB->begin(), - E = MBB->getFirstTerminator(); I != E; ++I) { - if (I->isDebugInstr()) + for (MachineInstr &MI : + llvm::make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (MI.isDebugInstr()) continue; if (++InstrCount > BlockInstrLimit && !Stress) { @@ -222,28 +222,28 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { } // There shouldn't normally be any phis in a single-predecessor block. - if (I->isPHI()) { - LLVM_DEBUG(dbgs() << "Can't hoist: " << *I); + if (MI.isPHI()) { + LLVM_DEBUG(dbgs() << "Can't hoist: " << MI); return false; } // Don't speculate loads. 
Note that it may be possible and desirable to // speculate GOT or constant pool loads that are guaranteed not to trap, // but we don't support that for now. - if (I->mayLoad()) { - LLVM_DEBUG(dbgs() << "Won't speculate load: " << *I); + if (MI.mayLoad()) { + LLVM_DEBUG(dbgs() << "Won't speculate load: " << MI); return false; } // We never speculate stores, so an AA pointer isn't necessary. bool DontMoveAcrossStore = true; - if (!I->isSafeToMove(nullptr, DontMoveAcrossStore)) { - LLVM_DEBUG(dbgs() << "Can't speculate: " << *I); + if (!MI.isSafeToMove(nullptr, DontMoveAcrossStore)) { + LLVM_DEBUG(dbgs() << "Can't speculate: " << MI); return false; } // Check for any dependencies on Head instructions. - if (!InstrDependenciesAllowIfConv(&(*I))) + if (!InstrDependenciesAllowIfConv(&MI)) return false; } return true; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 17094a8e44f8..d061664e8c5d 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -256,7 +256,7 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, LLT PartLLT = MRI.getType(SrcRegs[0]); // Deal with v3s16 split into v2s16 - LLT LCMTy = getLCMType(LLTy, PartLLT); + LLT LCMTy = getCoverTy(LLTy, PartLLT); if (LCMTy == LLTy) { // Common case where no padding is needed. assert(DstRegs.size() == 1); @@ -267,21 +267,9 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, // widening the original value. Register UnmergeSrcReg; if (LCMTy != PartLLT) { - // e.g. 
A <3 x s16> value was split to <2 x s16> - // %register_value0:_(<2 x s16>) - // %register_value1:_(<2 x s16>) - // %undef:_(<2 x s16>) = G_IMPLICIT_DEF - // %concat:_<6 x s16>) = G_CONCAT_VECTORS %reg_value0, %reg_value1, %undef - // %dst_reg:_(<3 x s16>), %dead:_(<3 x s16>) = G_UNMERGE_VALUES %concat - const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits(); - Register Undef = B.buildUndef(PartLLT).getReg(0); - - // Build vector of undefs. - SmallVector<Register, 8> WidenedSrcs(NumWide, Undef); - - // Replace the first sources with the real registers. - std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin()); - UnmergeSrcReg = B.buildConcatVectors(LCMTy, WidenedSrcs).getReg(0); + assert(DstRegs.size() == 1); + return B.buildDeleteTrailingVectorElements(DstRegs[0], + B.buildMerge(LCMTy, SrcRegs)); } else { // We don't need to widen anything if we're extracting a scalar which was // promoted to a vector e.g. s8 -> v4s8 -> s8 @@ -298,6 +286,8 @@ mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, for (int I = DstRegs.size(); I != NumDst; ++I) PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy); + if (PadDstRegs.size() == 1) + return B.buildDeleteTrailingVectorElements(DstRegs[0], UnmergeSrcReg); return B.buildUnmerge(PadDstRegs, UnmergeSrcReg); } @@ -485,7 +475,7 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, MachineRegisterInfo &MRI = *B.getMRI(); LLT DstTy = MRI.getType(DstRegs[0]); - LLT LCMTy = getLCMType(SrcTy, PartTy); + LLT LCMTy = getCoverTy(SrcTy, PartTy); const unsigned DstSize = DstTy.getSizeInBits(); const unsigned SrcSize = SrcTy.getSizeInBits(); @@ -493,7 +483,7 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, Register UnmergeSrc = SrcReg; - if (CoveringSize != SrcSize) { + if (!LCMTy.isVector() && CoveringSize != SrcSize) { // For scalars, it's common to be able to use a simple extension. 
if (SrcTy.isScalar() && DstTy.isScalar()) { CoveringSize = alignTo(SrcSize, DstSize); @@ -510,14 +500,10 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs, } } - // Unmerge to the original registers and pad with dead defs. - SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end()); - for (unsigned Size = DstSize * DstRegs.size(); Size != CoveringSize; - Size += DstSize) { - UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy)); - } + if (LCMTy.isVector() && CoveringSize != SrcSize) + UnmergeSrc = B.buildPadVectorWithUndefElements(LCMTy, SrcReg).getReg(0); - B.buildUnmerge(UnmergeResults, UnmergeSrc); + B.buildUnmerge(DstRegs, UnmergeSrc); } bool CallLowering::determineAndHandleAssignments( diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp index 381c6df5c97a..dd1ef74e8ad0 100644 --- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp @@ -135,7 +135,7 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF, // Erase dead insts before even adding to the list. if (isTriviallyDead(CurMI, *MRI)) { LLVM_DEBUG(dbgs() << CurMI << "Is dead; erasing.\n"); - CurMI.eraseFromParentAndMarkDBGValuesForRemoval(); + CurMI.eraseFromParent(); continue; } WorkList.deferred_insert(&CurMI); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 755b3b844570..f7a634dad61a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1551,8 +1551,8 @@ void CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI, Builder.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2}); // These were one use so it's safe to remove them. 
- MatchInfo.Shift2->eraseFromParentAndMarkDBGValuesForRemoval(); - MatchInfo.Logic->eraseFromParentAndMarkDBGValuesForRemoval(); + MatchInfo.Shift2->eraseFromParent(); + MatchInfo.Logic->eraseFromParent(); MI.eraseFromParent(); } diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 87cc60d51bc2..6d415c9c7f90 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -338,9 +338,10 @@ bool IRTranslator::translateCompare(const User &U, MIRBuilder.buildCopy( Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType()))); else { - assert(CI && "Instruction should be CmpInst"); - MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, - MachineInstr::copyFlagsFromInstruction(*CI)); + uint16_t Flags = 0; + if (CI) + Flags = MachineInstr::copyFlagsFromInstruction(*CI); + MIRBuilder.buildFCmp(Pred, Res, Op0, Op1, Flags); } return true; @@ -3502,7 +3503,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { // Get rid of the now empty basic block. EntryBB->removeSuccessor(&NewEntryBB); MF->remove(EntryBB); - MF->DeleteMachineBasicBlock(EntryBB); + MF->deleteMachineBasicBlock(EntryBB); assert(&MF->front() == &NewEntryBB && "New entry wasn't next in the list of basic block!"); diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 9b2692486384..b10c9272a508 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -163,7 +163,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { // If so, erase it. 
if (isTriviallyDead(MI, MRI)) { LLVM_DEBUG(dbgs() << "Is dead; erasing.\n"); - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); continue; } @@ -255,8 +255,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) { MachineInstr *MI = nullptr; if (!MRI.def_empty(VReg)) MI = &*MRI.def_instr_begin(VReg); - else if (!MRI.use_empty(VReg)) + else if (!MRI.use_empty(VReg)) { MI = &*MRI.use_instr_begin(VReg); + // Debug value instruction is permitted to use undefined vregs. + if (MI->isDebugValue()) + continue; + } if (!MI) continue; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index e09cd26eb0c1..e8a8efd5dad4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -176,16 +176,18 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy, return true; } + // Perform irregular split. Leftover is last element of RegPieces. if (MainTy.isVector()) { - unsigned EltSize = MainTy.getScalarSizeInBits(); - if (LeftoverSize % EltSize != 0) - return false; - LeftoverTy = LLT::scalarOrVector( - ElementCount::getFixed(LeftoverSize / EltSize), EltSize); - } else { - LeftoverTy = LLT::scalar(LeftoverSize); + SmallVector<Register, 8> RegPieces; + extractVectorParts(Reg, MainTy.getNumElements(), RegPieces); + for (unsigned i = 0; i < RegPieces.size() - 1; ++i) + VRegs.push_back(RegPieces[i]); + LeftoverRegs.push_back(RegPieces[RegPieces.size() - 1]); + LeftoverTy = MRI.getType(LeftoverRegs[0]); + return true; } + LeftoverTy = LLT::scalar(LeftoverSize); // For irregular sizes, extract the individual parts. 
for (unsigned I = 0; I != NumParts; ++I) { Register NewReg = MRI.createGenericVirtualRegister(MainTy); @@ -203,6 +205,44 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy, return true; } +void LegalizerHelper::extractVectorParts(Register Reg, unsigned NumElts, + SmallVectorImpl<Register> &VRegs) { + LLT RegTy = MRI.getType(Reg); + assert(RegTy.isVector() && "Expected a vector type"); + + LLT EltTy = RegTy.getElementType(); + LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy); + unsigned RegNumElts = RegTy.getNumElements(); + unsigned LeftoverNumElts = RegNumElts % NumElts; + unsigned NumNarrowTyPieces = RegNumElts / NumElts; + + // Perfect split without leftover + if (LeftoverNumElts == 0) + return extractParts(Reg, NarrowTy, NumNarrowTyPieces, VRegs); + + // Irregular split. Provide direct access to all elements for artifact + // combiner using unmerge to elements. Then build vectors with NumElts + // elements. Remaining element(s) will be (used to build vector) Leftover. + SmallVector<Register, 8> Elts; + extractParts(Reg, EltTy, RegNumElts, Elts); + + unsigned Offset = 0; + // Requested sub-vectors of NarrowTy. + for (unsigned i = 0; i < NumNarrowTyPieces; ++i, Offset += NumElts) { + ArrayRef<Register> Pieces(&Elts[Offset], NumElts); + VRegs.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0)); + } + + // Leftover element(s). + if (LeftoverNumElts == 1) { + VRegs.push_back(Elts[Offset]); + } else { + LLT LeftoverTy = LLT::fixed_vector(LeftoverNumElts, EltTy); + ArrayRef<Register> Pieces(&Elts[Offset], LeftoverNumElts); + VRegs.push_back(MIRBuilder.buildMerge(LeftoverTy, Pieces).getReg(0)); + } +} + void LegalizerHelper::insertParts(Register DstReg, LLT ResultTy, LLT PartTy, ArrayRef<Register> PartRegs, @@ -223,6 +263,15 @@ void LegalizerHelper::insertParts(Register DstReg, return; } + // Merge sub-vectors with different number of elements and insert into DstReg. 
+ if (ResultTy.isVector()) { + assert(LeftoverRegs.size() == 1 && "Expected one leftover register"); + SmallVector<Register, 8> AllRegs; + for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs)) + AllRegs.push_back(Reg); + return mergeMixedSubvectors(DstReg, AllRegs); + } + SmallVector<Register> GCDRegs; LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy); for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs)) @@ -231,6 +280,30 @@ void LegalizerHelper::insertParts(Register DstReg, buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs); } +void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts, + Register Reg) { + LLT Ty = MRI.getType(Reg); + SmallVector<Register, 8> RegElts; + extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts); + Elts.append(RegElts); +} + +/// Merge \p PartRegs with different types into \p DstReg. +void LegalizerHelper::mergeMixedSubvectors(Register DstReg, + ArrayRef<Register> PartRegs) { + SmallVector<Register, 8> AllElts; + for (unsigned i = 0; i < PartRegs.size() - 1; ++i) + appendVectorElts(AllElts, PartRegs[i]); + + Register Leftover = PartRegs[PartRegs.size() - 1]; + if (MRI.getType(Leftover).isScalar()) + AllElts.push_back(Leftover); + else + appendVectorElts(AllElts, Leftover); + + MIRBuilder.buildMerge(DstReg, AllElts); +} + /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs. 
static void getUnmergeResults(SmallVectorImpl<Register> &Regs, const MachineInstr &MI) { @@ -916,8 +989,26 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI, return Legalized; } - case TargetOpcode::G_FREEZE: - return reduceOperationWidth(MI, TypeIdx, NarrowTy); + case TargetOpcode::G_FREEZE: { + if (TypeIdx != 0) + return UnableToLegalize; + + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + // Should widen scalar first + if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0) + return UnableToLegalize; + + auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg()); + SmallVector<Register, 8> Parts; + for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) { + Parts.push_back( + MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0)); + } + + MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Parts); + MI.eraseFromParent(); + return Legalized; + } case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: case TargetOpcode::G_SADDO: @@ -1372,37 +1463,17 @@ void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); - MO.setReg(widenWithUnmerge(WideTy, MO.getReg())); + Register Dst = MO.getReg(); + Register DstExt = MRI.createGenericVirtualRegister(WideTy); + MO.setReg(DstExt); + MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt); } void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, unsigned OpIdx) { MachineOperand &MO = MI.getOperand(OpIdx); - - LLT OldTy = MRI.getType(MO.getReg()); - unsigned OldElts = OldTy.getNumElements(); - unsigned NewElts = MoreTy.getNumElements(); - - unsigned NumParts = NewElts / OldElts; - - // Use concat_vectors if the result is a multiple of the number of elements. 
- if (NumParts * OldElts == NewElts) { - SmallVector<Register, 8> Parts; - Parts.push_back(MO.getReg()); - - Register ImpDef = MIRBuilder.buildUndef(OldTy).getReg(0); - for (unsigned I = 1; I != NumParts; ++I) - Parts.push_back(ImpDef); - - auto Concat = MIRBuilder.buildConcatVectors(MoreTy, Parts); - MO.setReg(Concat.getReg(0)); - return; - } - - Register MoreReg = MRI.createGenericVirtualRegister(MoreTy); - Register ImpDef = MIRBuilder.buildUndef(MoreTy).getReg(0); - MIRBuilder.buildInsert(MoreReg, ImpDef, MO.getReg(), 0); - MO.setReg(MoreReg); + SmallVector<Register, 8> Regs; + MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0)); } void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { @@ -3558,20 +3629,83 @@ Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy, return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0); } -LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( - MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT LCMTy = getLCMType(DstTy, NarrowTy); +#ifndef NDEBUG +/// Check that all vector operands have same number of elements. Other operands +/// should be listed in NonVecOp. 
+static bool hasSameNumEltsOnAllVectorOperands( + GenericMachineInstr &MI, MachineRegisterInfo &MRI, + std::initializer_list<unsigned> NonVecOpIndices) { + if (MI.getNumMemOperands() != 0) + return false; - unsigned NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits(); + LLT VecTy = MRI.getType(MI.getReg(0)); + if (!VecTy.isVector()) + return false; + unsigned NumElts = VecTy.getNumElements(); - auto NewUndef = MIRBuilder.buildUndef(NarrowTy); - SmallVector<Register, 8> Parts(NumParts, NewUndef.getReg(0)); + for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) { + MachineOperand &Op = MI.getOperand(OpIdx); + if (!Op.isReg()) { + if (!is_contained(NonVecOpIndices, OpIdx)) + return false; + continue; + } - buildWidenedRemergeToDst(DstReg, LCMTy, Parts); - MI.eraseFromParent(); - return Legalized; + LLT Ty = MRI.getType(Op.getReg()); + if (!Ty.isVector()) { + if (!is_contained(NonVecOpIndices, OpIdx)) + return false; + is_contained(NonVecOpIndices, OpIdx); + continue; + } + + if (Ty.getNumElements() != NumElts) + return false; + } + + return true; +} +#endif + +/// Fill \p DstOps with DstOps that have same number of elements combined as +/// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are +/// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple +/// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements. +static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty, + unsigned NumElts) { + LLT LeftoverTy; + assert(Ty.isVector() && "Expected vector type"); + LLT EltTy = Ty.getElementType(); + LLT NarrowTy = (NumElts == 1) ? 
EltTy : LLT::fixed_vector(NumElts, EltTy); + int NumParts, NumLeftover; + std::tie(NumParts, NumLeftover) = + getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy); + + assert(NumParts > 0 && "Error in getNarrowTypeBreakDown"); + for (int i = 0; i < NumParts; ++i) { + DstOps.push_back(NarrowTy); + } + + if (LeftoverTy.isValid()) { + assert(NumLeftover == 1 && "expected exactly one leftover"); + DstOps.push_back(LeftoverTy); + } +} + +/// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps +/// made from \p Op depending on operand type. +static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N, + MachineOperand &Op) { + for (unsigned i = 0; i < N; ++i) { + if (Op.isReg()) + Ops.push_back(Op.getReg()); + else if (Op.isImm()) + Ops.push_back(Op.getImm()); + else if (Op.isPredicate()) + Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate())); + else + llvm_unreachable("Unsupported type"); + } } // Handle splitting vector operations which need to have the same number of @@ -3588,335 +3722,116 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef( // s64 = G_SHL s64, s32 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorMultiEltType( - MachineInstr &MI, unsigned TypeIdx, LLT NarrowTyArg) { - if (TypeIdx != 0) - return UnableToLegalize; + GenericMachineInstr &MI, unsigned NumElts, + std::initializer_list<unsigned> NonVecOpIndices) { + assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) && + "Non-compatible opcode or not specified non-vector operands"); + unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); - const LLT NarrowTy0 = NarrowTyArg; - const Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT LeftoverTy0; - - // All of the operands need to have the same number of elements, so if we can - // determine a type breakdown for the result type, we can for all of the - // source types. 
- int NumParts = getNarrowTypeBreakDown(DstTy, NarrowTy0, LeftoverTy0).first; - if (NumParts < 0) - return UnableToLegalize; + unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); + unsigned NumDefs = MI.getNumDefs(); - SmallVector<MachineInstrBuilder, 4> NewInsts; - - SmallVector<Register, 4> DstRegs, LeftoverDstRegs; - SmallVector<Register, 4> PartRegs, LeftoverRegs; - - for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) { - Register SrcReg = MI.getOperand(I).getReg(); - LLT SrcTyI = MRI.getType(SrcReg); - const auto NewEC = NarrowTy0.isVector() ? NarrowTy0.getElementCount() - : ElementCount::getFixed(1); - LLT NarrowTyI = LLT::scalarOrVector(NewEC, SrcTyI.getScalarType()); - LLT LeftoverTyI; - - // Split this operand into the requested typed registers, and any leftover - // required to reproduce the original type. - if (!extractParts(SrcReg, SrcTyI, NarrowTyI, LeftoverTyI, PartRegs, - LeftoverRegs)) - return UnableToLegalize; - - if (I == 1) { - // For the first operand, create an instruction for each part and setup - // the result. - for (Register PartReg : PartRegs) { - Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy0); - NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) - .addDef(PartDstReg) - .addUse(PartReg)); - DstRegs.push_back(PartDstReg); - } + // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output. + // Build instructions with DstOps to use instruction found by CSE directly. + // CSE copies found instruction into given vreg when building with vreg dest. + SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs); + // Output registers will be taken from created instructions. 
+ SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs); + for (unsigned i = 0; i < NumDefs; ++i) { + makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts); + } - for (Register LeftoverReg : LeftoverRegs) { - Register PartDstReg = MRI.createGenericVirtualRegister(LeftoverTy0); - NewInsts.push_back(MIRBuilder.buildInstrNoInsert(MI.getOpcode()) - .addDef(PartDstReg) - .addUse(LeftoverReg)); - LeftoverDstRegs.push_back(PartDstReg); - } + // Split vector input operands into sub-vectors with NumElts elts + Leftover. + // Operands listed in NonVecOpIndices will be used as is without splitting; + // examples: compare predicate in icmp and fcmp (op 1), vector select with i1 + // scalar condition (op 1), immediate in sext_inreg (op 2). + SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs); + for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); + ++UseIdx, ++UseNo) { + if (is_contained(NonVecOpIndices, UseIdx)) { + broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(), + MI.getOperand(UseIdx)); } else { - assert(NewInsts.size() == PartRegs.size() + LeftoverRegs.size()); - - // Add the newly created operand splits to the existing instructions. The - // odd-sized pieces are ordered after the requested NarrowTyArg sized - // pieces. - unsigned InstCount = 0; - for (unsigned J = 0, JE = PartRegs.size(); J != JE; ++J) - NewInsts[InstCount++].addUse(PartRegs[J]); - for (unsigned J = 0, JE = LeftoverRegs.size(); J != JE; ++J) - NewInsts[InstCount++].addUse(LeftoverRegs[J]); + SmallVector<Register, 8> SplitPieces; + extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces); + for (auto Reg : SplitPieces) + InputOpsPieces[UseNo].push_back(Reg); } - - PartRegs.clear(); - LeftoverRegs.clear(); } - // Insert the newly built operations and rebuild the result register. - for (auto &MIB : NewInsts) - MIRBuilder.insertInstr(MIB); + unsigned NumLeftovers = OrigNumElts % NumElts ? 
1 : 0; - insertParts(DstReg, DstTy, NarrowTy0, DstRegs, LeftoverTy0, LeftoverDstRegs); + // Take i-th piece of each input operand split and build sub-vector/scalar + // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s). + for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { + SmallVector<DstOp, 2> Defs; + for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) + Defs.push_back(OutputOpsPieces[DstNo][i]); - MI.eraseFromParent(); - return Legalized; -} + SmallVector<SrcOp, 3> Uses; + for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo) + Uses.push_back(InputOpsPieces[InputNo][i]); -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - if (TypeIdx != 0) - return UnableToLegalize; - - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(SrcReg); - - LLT NarrowTy0 = NarrowTy; - LLT NarrowTy1; - unsigned NumParts; - - if (NarrowTy.isVector()) { - // Uneven breakdown not handled. 
- NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); - if (NumParts * NarrowTy.getNumElements() != DstTy.getNumElements()) - return UnableToLegalize; - - NarrowTy1 = LLT::vector(NarrowTy.getElementCount(), SrcTy.getElementType()); - } else { - NumParts = DstTy.getNumElements(); - NarrowTy1 = SrcTy.getElementType(); + auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags()); + for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo) + OutputRegs[DstNo].push_back(I.getReg(DstNo)); } - SmallVector<Register, 4> SrcRegs, DstRegs; - extractParts(SrcReg, NarrowTy1, NumParts, SrcRegs); - - for (unsigned I = 0; I < NumParts; ++I) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - MachineInstr *NewInst = - MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]}); - - NewInst->setFlags(MI.getFlags()); - DstRegs.push_back(DstReg); + // Merge small outputs into MI's output for each def operand. + if (NumLeftovers) { + for (unsigned i = 0; i < NumDefs; ++i) + mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]); + } else { + for (unsigned i = 0; i < NumDefs; ++i) + MIRBuilder.buildMerge(MI.getReg(i), OutputRegs[i]); } - if (NarrowTy.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - MI.eraseFromParent(); return Legalized; } LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorCmp(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register Src0Reg = MI.getOperand(2).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(Src0Reg); +LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI, + unsigned NumElts) { + unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements(); - unsigned NumParts; - LLT NarrowTy0, NarrowTy1; + unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs(); + unsigned NumDefs = MI.getNumDefs(); - if (TypeIdx == 0) { - unsigned NewElts = 
NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; - unsigned OldElts = DstTy.getNumElements(); - - NarrowTy0 = NarrowTy; - NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : DstTy.getNumElements(); - NarrowTy1 = NarrowTy.isVector() ? LLT::vector(NarrowTy.getElementCount(), - SrcTy.getScalarSizeInBits()) - : SrcTy.getElementType(); - - } else { - unsigned NewElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1; - unsigned OldElts = SrcTy.getNumElements(); + SmallVector<DstOp, 8> OutputOpsPieces; + SmallVector<Register, 8> OutputRegs; + makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts); - NumParts = NarrowTy.isVector() ? (OldElts / NewElts) : - NarrowTy.getNumElements(); - NarrowTy0 = - LLT::vector(NarrowTy.getElementCount(), DstTy.getScalarSizeInBits()); - NarrowTy1 = NarrowTy; + // Instructions that perform register split will be inserted in basic block + // where register is defined (basic block is in the next operand). + SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2); + for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands(); + UseIdx += 2, ++UseNo) { + MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB(); + MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); + extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo]); } - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. - if (NarrowTy1.isVector() && - NarrowTy1.getNumElements() * NumParts != DstTy.getNumElements()) - return UnableToLegalize; - - CmpInst::Predicate Pred - = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + // Build PHIs with fewer elements. + unsigned NumLeftovers = OrigNumElts % NumElts ? 
1 : 0; + MIRBuilder.setInsertPt(*MI.getParent(), MI); + for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) { + auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI); + Phi.addDef( + MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI))); + OutputRegs.push_back(Phi.getReg(0)); - SmallVector<Register, 2> Src1Regs, Src2Regs, DstRegs; - extractParts(MI.getOperand(2).getReg(), NarrowTy1, NumParts, Src1Regs); - extractParts(MI.getOperand(3).getReg(), NarrowTy1, NumParts, Src2Regs); - - for (unsigned I = 0; I < NumParts; ++I) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - DstRegs.push_back(DstReg); - - if (MI.getOpcode() == TargetOpcode::G_ICMP) - MIRBuilder.buildICmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]); - else { - MachineInstr *NewCmp - = MIRBuilder.buildFCmp(Pred, DstReg, Src1Regs[I], Src2Regs[I]); - NewCmp->setFlags(MI.getFlags()); + for (unsigned j = 0; j < NumInputs / 2; ++j) { + Phi.addUse(InputOpsPieces[j][i]); + Phi.add(MI.getOperand(1 + j * 2 + 1)); } } - if (NarrowTy1.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorSelect(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register CondReg = MI.getOperand(1).getReg(); - - unsigned NumParts = 0; - LLT NarrowTy0, NarrowTy1; - - LLT DstTy = MRI.getType(DstReg); - LLT CondTy = MRI.getType(CondReg); - unsigned Size = DstTy.getSizeInBits(); - - assert(TypeIdx == 0 || CondTy.isVector()); - - if (TypeIdx == 0) { - NarrowTy0 = NarrowTy; - NarrowTy1 = CondTy; - - unsigned NarrowSize = NarrowTy0.getSizeInBits(); - // FIXME: Don't know how to handle the situation where the small vectors - // aren't all the same size yet. 
- if (Size % NarrowSize != 0) - return UnableToLegalize; - - NumParts = Size / NarrowSize; - - // Need to break down the condition type - if (CondTy.isVector()) { - if (CondTy.getNumElements() == NumParts) - NarrowTy1 = CondTy.getElementType(); - else - NarrowTy1 = - LLT::vector(CondTy.getElementCount().divideCoefficientBy(NumParts), - CondTy.getScalarSizeInBits()); - } + // Merge small outputs into MI's def. + if (NumLeftovers) { + mergeMixedSubvectors(MI.getReg(0), OutputRegs); } else { - NumParts = CondTy.getNumElements(); - if (NarrowTy.isVector()) { - // TODO: Handle uneven breakdown. - if (NumParts * NarrowTy.getNumElements() != CondTy.getNumElements()) - return UnableToLegalize; - - return UnableToLegalize; - } else { - NarrowTy0 = DstTy.getElementType(); - NarrowTy1 = NarrowTy; - } - } - - SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs; - if (CondTy.isVector()) - extractParts(MI.getOperand(1).getReg(), NarrowTy1, NumParts, Src0Regs); - - extractParts(MI.getOperand(2).getReg(), NarrowTy0, NumParts, Src1Regs); - extractParts(MI.getOperand(3).getReg(), NarrowTy0, NumParts, Src2Regs); - - for (unsigned i = 0; i < NumParts; ++i) { - Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0); - MIRBuilder.buildSelect(DstReg, CondTy.isVector() ? Src0Regs[i] : CondReg, - Src1Regs[i], Src2Regs[i]); - DstRegs.push_back(DstReg); - } - - if (NarrowTy0.isVector()) - MIRBuilder.buildConcatVectors(DstReg, DstRegs); - else - MIRBuilder.buildBuildVector(DstReg, DstRegs); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - const Register DstReg = MI.getOperand(0).getReg(); - LLT PhiTy = MRI.getType(DstReg); - LLT LeftoverTy; - - // All of the operands need to have the same number of elements, so if we can - // determine a type breakdown for the result type, we can for all of the - // source types. 
- int NumParts, NumLeftover; - std::tie(NumParts, NumLeftover) - = getNarrowTypeBreakDown(PhiTy, NarrowTy, LeftoverTy); - if (NumParts < 0) - return UnableToLegalize; - - SmallVector<Register, 4> DstRegs, LeftoverDstRegs; - SmallVector<MachineInstrBuilder, 4> NewInsts; - - const int TotalNumParts = NumParts + NumLeftover; - - // Insert the new phis in the result block first. - for (int I = 0; I != TotalNumParts; ++I) { - LLT Ty = I < NumParts ? NarrowTy : LeftoverTy; - Register PartDstReg = MRI.createGenericVirtualRegister(Ty); - NewInsts.push_back(MIRBuilder.buildInstr(TargetOpcode::G_PHI) - .addDef(PartDstReg)); - if (I < NumParts) - DstRegs.push_back(PartDstReg); - else - LeftoverDstRegs.push_back(PartDstReg); - } - - MachineBasicBlock *MBB = MI.getParent(); - MIRBuilder.setInsertPt(*MBB, MBB->getFirstNonPHI()); - insertParts(DstReg, PhiTy, NarrowTy, DstRegs, LeftoverTy, LeftoverDstRegs); - - SmallVector<Register, 4> PartRegs, LeftoverRegs; - - // Insert code to extract the incoming values in each predecessor block. - for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { - PartRegs.clear(); - LeftoverRegs.clear(); - - Register SrcReg = MI.getOperand(I).getReg(); - MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB(); - MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator()); - - LLT Unused; - if (!extractParts(SrcReg, PhiTy, NarrowTy, Unused, PartRegs, - LeftoverRegs)) - return UnableToLegalize; - - // Add the newly created operand splits to the existing instructions. The - // odd-sized pieces are ordered after the requested NarrowTyArg sized - // pieces. - for (int J = 0; J != TotalNumParts; ++J) { - MachineInstrBuilder MIB = NewInsts[J]; - MIB.addUse(J < NumParts ? 
PartRegs[J] : LeftoverRegs[J - NumParts]); - MIB.addMBB(&OpMBB); - } + MIRBuilder.buildMerge(MI.getReg(0), OutputRegs); } MI.eraseFromParent(); @@ -3927,27 +3842,36 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { - if (TypeIdx != 1) - return UnableToLegalize; - const int NumDst = MI.getNumOperands() - 1; const Register SrcReg = MI.getOperand(NumDst).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcTy = MRI.getType(SrcReg); - // TODO: Create sequence of extracts. - if (DstTy == NarrowTy) + if (TypeIdx != 1 || NarrowTy == DstTy) return UnableToLegalize; - LLT GCDTy = getGCDType(SrcTy, NarrowTy); - if (DstTy == GCDTy) { - // This would just be a copy of the same unmerge. - // TODO: Create extracts, pad with undef and create intermediate merges. + // Requires compatible types. Otherwise SrcReg should have been defined by + // merge-like instruction that would get artifact combined. Most likely + // instruction that defines SrcReg has to perform more/fewer elements + // legalization compatible with NarrowTy. + assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types"); + assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + + if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) || + (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0)) return UnableToLegalize; - } - auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg); + // This is most likely DstTy (smaller then register size) packed in SrcTy + // (larger then register size) and since unmerge was not combined it will be + // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy + // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy. 
+ + // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy) + // + // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence + // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg + // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy) + auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg); const int NumUnmerge = Unmerge->getNumOperands() - 1; const int PartsPerUnmerge = NumDst / NumUnmerge; @@ -3964,89 +3888,87 @@ LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI, } LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorMulo(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register Result = MI.getOperand(0).getReg(); - Register Overflow = MI.getOperand(1).getReg(); - Register LHS = MI.getOperand(2).getReg(); - Register RHS = MI.getOperand(3).getReg(); - - LLT SrcTy = MRI.getType(LHS); - if (!SrcTy.isVector()) +LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, + LLT NarrowTy) { + Register DstReg = MI.getOperand(0).getReg(); + LLT DstTy = MRI.getType(DstReg); + LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); + // Requires compatible types. Otherwise user of DstReg did not perform unmerge + // that should have been artifact combined. Most likely instruction that uses + // DstReg has to do more/fewer elements legalization compatible with NarrowTy. + assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types"); + assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + if (NarrowTy == SrcTy) return UnableToLegalize; - LLT ElementType = SrcTy.getElementType(); - LLT OverflowElementTy = MRI.getType(Overflow).getElementType(); - const ElementCount NumResult = SrcTy.getElementCount(); - LLT GCDTy = getGCDType(SrcTy, NarrowTy); + // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use + // is for old mir tests. 
Since the changes to more/fewer elements it should no + // longer be possible to generate MIR like this when starting from llvm-ir + // because LCMTy approach was replaced with merge/unmerge to vector elements. + if (TypeIdx == 1) { + assert(SrcTy.isVector() && "Expected vector types"); + assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type"); + if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) || + (NarrowTy.getNumElements() >= SrcTy.getNumElements())) + return UnableToLegalize; + // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy) + // + // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy) + // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy) + // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4 + // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6 + // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8 + // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11 - // Unmerge the operands to smaller parts of GCD type. - auto UnmergeLHS = MIRBuilder.buildUnmerge(GCDTy, LHS); - auto UnmergeRHS = MIRBuilder.buildUnmerge(GCDTy, RHS); + SmallVector<Register, 8> Elts; + LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType(); + for (unsigned i = 1; i < MI.getNumOperands(); ++i) { + auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg()); + for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j) + Elts.push_back(Unmerge.getReg(j)); + } - const int NumOps = UnmergeLHS->getNumOperands() - 1; - const ElementCount PartsPerUnmerge = NumResult.divideCoefficientBy(NumOps); - LLT OverflowTy = LLT::scalarOrVector(PartsPerUnmerge, OverflowElementTy); - LLT ResultTy = LLT::scalarOrVector(PartsPerUnmerge, ElementType); + SmallVector<Register, 8> NarrowTyElts; + unsigned NumNarrowTyElts = NarrowTy.getNumElements(); + unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts; + for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces; + ++i, Offset += NumNarrowTyElts) { + ArrayRef<Register> Pieces(&Elts[Offset], 
NumNarrowTyElts); + NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Pieces).getReg(0)); + } - // Perform the operation over unmerged parts. - SmallVector<Register, 8> ResultParts; - SmallVector<Register, 8> OverflowParts; - for (int I = 0; I != NumOps; ++I) { - Register Operand1 = UnmergeLHS->getOperand(I).getReg(); - Register Operand2 = UnmergeRHS->getOperand(I).getReg(); - auto PartMul = MIRBuilder.buildInstr(MI.getOpcode(), {ResultTy, OverflowTy}, - {Operand1, Operand2}); - ResultParts.push_back(PartMul->getOperand(0).getReg()); - OverflowParts.push_back(PartMul->getOperand(1).getReg()); + MIRBuilder.buildMerge(DstReg, NarrowTyElts); + MI.eraseFromParent(); + return Legalized; } - LLT ResultLCMTy = buildLCMMergePieces(SrcTy, NarrowTy, GCDTy, ResultParts); - LLT OverflowLCMTy = - LLT::scalarOrVector(ResultLCMTy.getElementCount(), OverflowElementTy); - - // Recombine the pieces to the original result and overflow registers. - buildWidenedRemergeToDst(Result, ResultLCMTy, ResultParts); - buildWidenedRemergeToDst(Overflow, OverflowLCMTy, OverflowParts); - MI.eraseFromParent(); - return Legalized; -} - -// Handle FewerElementsVector a G_BUILD_VECTOR or G_CONCAT_VECTORS that produces -// a vector -// -// Create a G_BUILD_VECTOR or G_CONCAT_VECTORS of NarrowTy pieces, padding with -// undef as necessary. 
-// -// %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2 -// -> <2 x s16> -// -// %4:_(s16) = G_IMPLICIT_DEF -// %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1 -// %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4 -// %7:_(<2 x s16>) = G_IMPLICIT_DEF -// %8:_(<6 x s16>) = G_CONCAT_VECTORS %5, %6, %7 -// %3:_(<3 x s16>), %8:_(<3 x s16>) = G_UNMERGE_VALUES %8 -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - LLT DstTy = MRI.getType(DstReg); - LLT SrcTy = MRI.getType(MI.getOperand(1).getReg()); - LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy); - - // Break into a common type - SmallVector<Register, 16> Parts; - for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) - extractGCDType(Parts, GCDTy, MO.getReg()); + assert(TypeIdx == 0 && "Bad type index"); + if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) || + (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0)) + return UnableToLegalize; - // Build the requested new merge, padding with undef. - LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, - TargetOpcode::G_ANYEXT); + // This is most likely SrcTy (smaller then register size) packed in DstTy + // (larger then register size) and since merge was not combined it will be + // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy + // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy. - // Pack into the original result register. 
- buildWidenedRemergeToDst(DstReg, LCMTy, Parts); + // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4 + // + // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg + // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4 + // %0:_(DstTy) = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence + SmallVector<Register, 8> NarrowTyElts; + unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements(); + unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1; + unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts; + for (unsigned i = 0; i < NumParts; ++i) { + SmallVector<Register, 8> Sources; + for (unsigned j = 0; j < NumElts; ++j) + Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg()); + NarrowTyElts.push_back(MIRBuilder.buildMerge(NarrowTy, Sources).getReg(0)); + } + MIRBuilder.buildMerge(DstReg, NarrowTyElts); MI.eraseFromParent(); return Legalized; } @@ -4218,163 +4140,14 @@ LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx, } LegalizerHelper::LegalizeResult -LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx, - LLT NarrowTy) { - assert(TypeIdx == 0 && "only one type index expected"); - - const unsigned Opc = MI.getOpcode(); - const int NumDefOps = MI.getNumExplicitDefs(); - const int NumSrcOps = MI.getNumOperands() - NumDefOps; - const unsigned Flags = MI.getFlags(); - const unsigned NarrowSize = NarrowTy.getSizeInBits(); - const LLT NarrowScalarTy = LLT::scalar(NarrowSize); - - assert(MI.getNumOperands() <= 4 && "expected instruction with either 1 " - "result and 1-3 sources or 2 results and " - "1-2 sources"); - - SmallVector<Register, 2> DstRegs; - for (int I = 0; I < NumDefOps; ++I) - DstRegs.push_back(MI.getOperand(I).getReg()); - - // First of all check whether we are narrowing (changing the element type) - // or reducing the vector elements - const LLT DstTy = MRI.getType(DstRegs[0]); - const bool IsNarrow = NarrowTy.getScalarType() != 
DstTy.getScalarType(); - - SmallVector<Register, 8> ExtractedRegs[3]; - SmallVector<Register, 8> Parts; - - // Break down all the sources into NarrowTy pieces we can operate on. This may - // involve creating merges to a wider type, padded with undef. - for (int I = 0; I != NumSrcOps; ++I) { - Register SrcReg = MI.getOperand(I + NumDefOps).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - - // The type to narrow SrcReg to. For narrowing, this is a smaller scalar. - // For fewerElements, this is a smaller vector with the same element type. - LLT OpNarrowTy; - if (IsNarrow) { - OpNarrowTy = NarrowScalarTy; - - // In case of narrowing, we need to cast vectors to scalars for this to - // work properly - // FIXME: Can we do without the bitcast here if we're narrowing? - if (SrcTy.isVector()) { - SrcTy = LLT::scalar(SrcTy.getSizeInBits()); - SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0); - } - } else { - auto NarrowEC = NarrowTy.isVector() ? NarrowTy.getElementCount() - : ElementCount::getFixed(1); - OpNarrowTy = LLT::scalarOrVector(NarrowEC, SrcTy.getScalarType()); - } - - LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg); - - // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand. - buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I], - TargetOpcode::G_ANYEXT); - } - - SmallVector<Register, 8> ResultRegs[2]; - - // Input operands for each sub-instruction. - SmallVector<SrcOp, 4> InputRegs(NumSrcOps, Register()); - - int NumParts = ExtractedRegs[0].size(); - const unsigned DstSize = DstTy.getSizeInBits(); - const LLT DstScalarTy = LLT::scalar(DstSize); - - // Narrowing needs to use scalar types - LLT DstLCMTy, NarrowDstTy; - if (IsNarrow) { - DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy); - NarrowDstTy = NarrowScalarTy; - } else { - DstLCMTy = getLCMType(DstTy, NarrowTy); - NarrowDstTy = NarrowTy; - } - - // We widened the source registers to satisfy merge/unmerge size - // constraints. 
We'll have some extra fully undef parts. - const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize; - - for (int I = 0; I != NumRealParts; ++I) { - // Emit this instruction on each of the split pieces. - for (int J = 0; J != NumSrcOps; ++J) - InputRegs[J] = ExtractedRegs[J][I]; - - MachineInstrBuilder Inst; - if (NumDefOps == 1) - Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags); - else - Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy, NarrowDstTy}, InputRegs, - Flags); - - for (int J = 0; J != NumDefOps; ++J) - ResultRegs[J].push_back(Inst.getReg(J)); - } - - // Fill out the widened result with undef instead of creating instructions - // with undef inputs. - int NumUndefParts = NumParts - NumRealParts; - if (NumUndefParts != 0) { - Register Undef = MIRBuilder.buildUndef(NarrowDstTy).getReg(0); - for (int I = 0; I != NumDefOps; ++I) - ResultRegs[I].append(NumUndefParts, Undef); - } - - // Extract the possibly padded result. Use a scratch register if we need to do - // a final bitcast, otherwise use the original result register. 
- Register MergeDstReg; - for (int I = 0; I != NumDefOps; ++I) { - if (IsNarrow && DstTy.isVector()) - MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy); - else - MergeDstReg = DstRegs[I]; - - buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs[I]); - - // Recast to vector if we narrowed a vector - if (IsNarrow && DstTy.isVector()) - MIRBuilder.buildBitcast(DstRegs[I], MergeDstReg); - } - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult -LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx, - LLT NarrowTy) { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - int64_t Imm = MI.getOperand(2).getImm(); - - LLT DstTy = MRI.getType(DstReg); - - SmallVector<Register, 8> Parts; - LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg); - LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts); - - for (Register &R : Parts) - R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0); - - buildWidenedRemergeToDst(DstReg, LCMTy, Parts); - - MI.eraseFromParent(); - return Legalized; -} - -LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy) { using namespace TargetOpcode; + GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI); + unsigned NumElts = NarrowTy.isVector() ? 
NarrowTy.getNumElements() : 1; switch (MI.getOpcode()) { case G_IMPLICIT_DEF: - return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy); case G_TRUNC: case G_AND: case G_OR: @@ -4439,10 +4212,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_SSUBSAT: case G_UADDSAT: case G_USUBSAT: - return reduceOperationWidth(MI, TypeIdx, NarrowTy); case G_UMULO: case G_SMULO: - return fewerElementsVectorMulo(MI, TypeIdx, NarrowTy); case G_SHL: case G_LSHR: case G_ASHR: @@ -4454,7 +4225,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_CTTZ_ZERO_UNDEF: case G_CTPOP: case G_FCOPYSIGN: - return fewerElementsVectorMultiEltType(MI, TypeIdx, NarrowTy); case G_ZEXT: case G_SEXT: case G_ANYEXT: @@ -4467,14 +4237,16 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTTOPTR: case G_PTRTOINT: case G_ADDRSPACE_CAST: - return fewerElementsVectorCasts(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts); case G_ICMP: case G_FCMP: - return fewerElementsVectorCmp(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/}); case G_SELECT: - return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy); + if (MRI.getType(MI.getOperand(1).getReg()).isVector()) + return fewerElementsVectorMultiEltType(GMI, NumElts); + return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/}); case G_PHI: - return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy); + return fewerElementsVectorPhi(GMI, NumElts); case G_UNMERGE_VALUES: return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy); case G_BUILD_VECTOR: @@ -4491,7 +4263,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_STORE: return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy); case G_SEXT_INREG: - return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy); + return fewerElementsVectorMultiEltType(GMI, NumElts, {2 
/*imm*/}); GISEL_VECREDUCE_CASES_NONSEQ return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy); case G_SHUFFLE_VECTOR: @@ -5053,6 +4825,15 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, case TargetOpcode::G_AND: case TargetOpcode::G_OR: case TargetOpcode::G_XOR: + case TargetOpcode::G_ADD: + case TargetOpcode::G_SUB: + case TargetOpcode::G_MUL: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_UADDSAT: + case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SADDSAT: + case TargetOpcode::G_SSUBSAT: case TargetOpcode::G_SMIN: case TargetOpcode::G_SMAX: case TargetOpcode::G_UMIN: @@ -5070,6 +4851,17 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, Observer.changedInstr(MI); return Legalized; } + case TargetOpcode::G_FMA: + case TargetOpcode::G_FSHR: + case TargetOpcode::G_FSHL: { + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 1); + moreElementsVectorSrc(MI, MoreTy, 2); + moreElementsVectorSrc(MI, MoreTy, 3); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } case TargetOpcode::G_EXTRACT: if (TypeIdx != 1) return UnableToLegalize; @@ -5079,6 +4871,11 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, return Legalized; case TargetOpcode::G_INSERT: case TargetOpcode::G_FREEZE: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FABS: + case TargetOpcode::G_BSWAP: + case TargetOpcode::G_FCANONICALIZE: + case TargetOpcode::G_SEXT_INREG: if (TypeIdx != 0) return UnableToLegalize; Observer.changingInstr(MI); @@ -5098,30 +4895,34 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, moreElementsVectorDst(MI, MoreTy, 0); Observer.changedInstr(MI); return Legalized; - case TargetOpcode::G_UNMERGE_VALUES: { - if (TypeIdx != 1) - return UnableToLegalize; - - LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); - int NumDst = MI.getNumOperands() - 1; - moreElementsVectorSrc(MI, MoreTy, 
NumDst); - - auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES); - for (int I = 0; I != NumDst; ++I) - MIB.addDef(MI.getOperand(I).getReg()); + case TargetOpcode::G_UNMERGE_VALUES: + return UnableToLegalize; + case TargetOpcode::G_PHI: + return moreElementsVectorPhi(MI, TypeIdx, MoreTy); + case TargetOpcode::G_SHUFFLE_VECTOR: + return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); + case TargetOpcode::G_BUILD_VECTOR: { + SmallVector<SrcOp, 8> Elts; + for (auto Op : MI.uses()) { + Elts.push_back(Op.getReg()); + } - int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits(); - for (int I = NumDst; I != NewNumDst; ++I) - MIB.addDef(MRI.createGenericVirtualRegister(DstTy)); + for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) { + Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType())); + } - MIB.addUse(MI.getOperand(NumDst).getReg()); + MIRBuilder.buildDeleteTrailingVectorElements( + MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts)); MI.eraseFromParent(); return Legalized; } - case TargetOpcode::G_PHI: - return moreElementsVectorPhi(MI, TypeIdx, MoreTy); - case TargetOpcode::G_SHUFFLE_VECTOR: - return moreElementsVectorShuffle(MI, TypeIdx, MoreTy); + case TargetOpcode::G_TRUNC: { + Observer.changingInstr(MI); + moreElementsVectorSrc(MI, MoreTy, 1); + moreElementsVectorDst(MI, MoreTy, 0); + Observer.changedInstr(MI); + return Legalized; + } default: return UnableToLegalize; } @@ -6778,6 +6579,24 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { LLT VecTy = MRI.getType(SrcVec); LLT EltTy = VecTy.getElementType(); + unsigned NumElts = VecTy.getNumElements(); + + int64_t IdxVal; + if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) { + SmallVector<Register, 8> SrcRegs; + extractParts(SrcVec, EltTy, NumElts, SrcRegs); + + if (InsertVal) { + SrcRegs[IdxVal] = MI.getOperand(2).getReg(); + MIRBuilder.buildMerge(DstReg, SrcRegs); + } else { + MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]); + } 
+ + MI.eraseFromParent(); + return Legalized; + } + if (!EltTy.isByteSized()) { // Not implemented. LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n"); return UnableToLegalize; @@ -6796,7 +6615,6 @@ LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) { // if the index is out of bounds. Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx); - int64_t IdxVal; if (mi_match(Idx, MRI, m_ICst(IdxVal))) { int64_t Offset = IdxVal * EltBytes; PtrInfo = PtrInfo.getWithOffset(Offset); @@ -6923,6 +6741,32 @@ LegalizerHelper::lowerExtract(MachineInstr &MI) { LLT DstTy = MRI.getType(Dst); LLT SrcTy = MRI.getType(Src); + // Extract sub-vector or one element + if (SrcTy.isVector()) { + unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); + + if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) && + (Offset + DstSize <= SrcTy.getSizeInBits())) { + // Unmerge and allow access to each Src element for the artifact combiner. + auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), Src); + + // Take element(s) we need to extract and copy it (merge them). 
+ SmallVector<Register, 8> SubVectorElts; + for (unsigned Idx = Offset / SrcEltSize; + Idx < (Offset + DstSize) / SrcEltSize; ++Idx) { + SubVectorElts.push_back(Unmerge.getReg(Idx)); + } + if (SubVectorElts.size() == 1) + MIRBuilder.buildCopy(Dst, SubVectorElts[0]); + else + MIRBuilder.buildMerge(Dst, SubVectorElts); + + MI.eraseFromParent(); + return Legalized; + } + } + if (DstTy.isScalar() && (SrcTy.isScalar() || (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) { @@ -6956,6 +6800,45 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) { LLT DstTy = MRI.getType(Src); LLT InsertTy = MRI.getType(InsertSrc); + // Insert sub-vector or one element + if (DstTy.isVector() && !InsertTy.isPointer()) { + LLT EltTy = DstTy.getElementType(); + unsigned EltSize = EltTy.getSizeInBits(); + unsigned InsertSize = InsertTy.getSizeInBits(); + + if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) && + (Offset + InsertSize <= DstTy.getSizeInBits())) { + auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src); + SmallVector<Register, 8> DstElts; + unsigned Idx = 0; + // Elements from Src before insert start Offset + for (; Idx < Offset / EltSize; ++Idx) { + DstElts.push_back(UnmergeSrc.getReg(Idx)); + } + + // Replace elements in Src with elements from InsertSrc + if (InsertTy.getSizeInBits() > EltSize) { + auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc); + for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize; + ++Idx, ++i) { + DstElts.push_back(UnmergeInsertSrc.getReg(i)); + } + } else { + DstElts.push_back(InsertSrc); + ++Idx; + } + + // Remaining elements from Src after insert + for (; Idx < DstTy.getNumElements(); ++Idx) { + DstElts.push_back(UnmergeSrc.getReg(Idx)); + } + + MIRBuilder.buildMerge(Dst, DstElts); + MI.eraseFromParent(); + return Legalized; + } + } + if (InsertTy.isVector() || (DstTy.isVector() && DstTy.getElementType() != InsertTy)) return UnableToLegalize; diff --git 
a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 03dda806cb1e..de8dbd456901 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -554,12 +554,11 @@ bool LoadStoreOpt::mergeBlockStores(MachineBasicBlock &MBB) { bool Changed = false; // Walk through the block bottom-up, looking for merging candidates. StoreMergeCandidate Candidate; - for (auto II = MBB.rbegin(), IE = MBB.rend(); II != IE; ++II) { - MachineInstr &MI = *II; + for (MachineInstr &MI : llvm::reverse(MBB)) { if (InstsToErase.contains(&MI)) continue; - if (auto StoreMI = dyn_cast<GStore>(&*II)) { + if (auto *StoreMI = dyn_cast<GStore>(&MI)) { // We have a G_STORE. Add it to the candidate if it writes to an adjacent // address. if (!addStoreToCandidate(*StoreMI, Candidate)) { diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index fb5ed35c1f72..391251886fbb 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -215,6 +215,48 @@ MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res, return buildPtrMask(Res, Op0, MaskReg); } +MachineInstrBuilder +MachineIRBuilder::buildPadVectorWithUndefElements(const DstOp &Res, + const SrcOp &Op0) { + LLT ResTy = Res.getLLTTy(*getMRI()); + LLT Op0Ty = Op0.getLLTTy(*getMRI()); + + assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); + assert((ResTy.getElementType() == Op0Ty.getElementType()) && + "Different vector element types"); + assert((ResTy.getNumElements() > Op0Ty.getNumElements()) && + "Op0 has more elements"); + + auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + SmallVector<Register, 8> Regs; + for (auto Op : Unmerge.getInstr()->defs()) + Regs.push_back(Op.getReg()); + Register Undef = buildUndef(Op0Ty.getElementType()).getReg(0); + unsigned NumberOfPadElts = ResTy.getNumElements() - 
Regs.size(); + for (unsigned i = 0; i < NumberOfPadElts; ++i) + Regs.push_back(Undef); + return buildMerge(Res, Regs); +} + +MachineInstrBuilder +MachineIRBuilder::buildDeleteTrailingVectorElements(const DstOp &Res, + const SrcOp &Op0) { + LLT ResTy = Res.getLLTTy(*getMRI()); + LLT Op0Ty = Op0.getLLTTy(*getMRI()); + + assert((ResTy.isVector() && Op0Ty.isVector()) && "Non vector type"); + assert((ResTy.getElementType() == Op0Ty.getElementType()) && + "Different vector element types"); + assert((ResTy.getNumElements() < Op0Ty.getNumElements()) && + "Op0 has fewer elements"); + + SmallVector<Register, 8> Regs; + auto Unmerge = buildUnmerge(Op0Ty.getElementType(), Op0); + for (unsigned i = 0; i < ResTy.getNumElements(); ++i) + Regs.push_back(Unmerge.getReg(i)); + return buildMerge(Res, Regs); +} + MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) { return buildInstr(TargetOpcode::G_BR).addMBB(&Dest); } @@ -613,10 +655,8 @@ MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<LLT> Res, MachineInstrBuilder MachineIRBuilder::buildUnmerge(LLT Res, const SrcOp &Op) { unsigned NumReg = Op.getLLTTy(*getMRI()).getSizeInBits() / Res.getSizeInBits(); - SmallVector<Register, 8> TmpVec; - for (unsigned I = 0; I != NumReg; ++I) - TmpVec.push_back(getMRI()->createGenericVirtualRegister(Res)); - return buildUnmerge(TmpVec, Op); + SmallVector<DstOp, 8> TmpVec(NumReg, Res); + return buildInstr(TargetOpcode::G_UNMERGE_VALUES, TmpVec, Op); } MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<Register> Res, diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index b0b84763e922..4981a537dc7c 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -923,6 +923,21 @@ LLT llvm::getLCMType(LLT OrigTy, LLT TargetTy) { return LLT::scalar(LCMSize); } +LLT llvm::getCoverTy(LLT OrigTy, LLT TargetTy) { + if (!OrigTy.isVector() || !TargetTy.isVector() || OrigTy == TargetTy || + 
(OrigTy.getScalarSizeInBits() != TargetTy.getScalarSizeInBits())) + return getLCMType(OrigTy, TargetTy); + + unsigned OrigTyNumElts = OrigTy.getNumElements(); + unsigned TargetTyNumElts = TargetTy.getNumElements(); + if (OrigTyNumElts % TargetTyNumElts == 0) + return OrigTy; + + unsigned NumElts = alignTo(OrigTyNumElts, TargetTyNumElts); + return LLT::scalarOrVector(ElementCount::getFixed(NumElts), + OrigTy.getElementType()); +} + LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) { const unsigned OrigSize = OrigTy.getSizeInBits(); const unsigned TargetSize = TargetTy.getSizeInBits(); @@ -1184,25 +1199,6 @@ bool llvm::shouldOptForSize(const MachineBasicBlock &MBB, llvm::shouldOptimizeForSize(MBB.getBasicBlock(), PSI, BFI); } -/// These artifacts generally don't have any debug users because they don't -/// directly originate from IR instructions, but instead usually from -/// legalization. Avoiding checking for debug users improves compile time. -/// Note that truncates or extends aren't included because they have IR -/// counterparts which can have debug users after translation. 
-static bool shouldSkipDbgValueFor(MachineInstr &MI) { - switch (MI.getOpcode()) { - case TargetOpcode::G_UNMERGE_VALUES: - case TargetOpcode::G_MERGE_VALUES: - case TargetOpcode::G_CONCAT_VECTORS: - case TargetOpcode::G_BUILD_VECTOR: - case TargetOpcode::G_EXTRACT: - case TargetOpcode::G_INSERT: - return true; - default: - return false; - } -} - void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver, SmallInstListTy &DeadInstChain) { @@ -1212,10 +1208,7 @@ void llvm::saveUsesAndErase(MachineInstr &MI, MachineRegisterInfo &MRI, } LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n"); DeadInstChain.remove(&MI); - if (shouldSkipDbgValueFor(MI)) - MI.eraseFromParent(); - else - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); if (LocObserver) LocObserver->checkpoint(false); } diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 0882ce366c9c..fc97938ccd3e 100644 --- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -242,7 +242,7 @@ bool ImplicitNullChecks::canHandle(const MachineInstr *MI) { auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); }; (void)IsRegMask; - assert(!llvm::any_of(MI->operands(), IsRegMask) && + assert(llvm::none_of(MI->operands(), IsRegMask) && "Calls were filtered out above!"); auto IsUnordered = [](MachineMemOperand *MMO) { return MMO->isUnordered(); }; diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index fc5ac45752ca..c975013db8c8 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -686,9 +686,7 @@ void InlineSpiller::reMaterializeAll() { // Remove any values that were completely rematted. 
for (Register Reg : RegsToSpill) { LiveInterval &LI = LIS.getInterval(Reg); - for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end(); - I != E; ++I) { - VNInfo *VNI = *I; + for (VNInfo *VNI : llvm::make_range(LI.vni_begin(), LI.vni_end())) { if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI)) continue; MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def); diff --git a/llvm/lib/CodeGen/InterferenceCache.cpp b/llvm/lib/CodeGen/InterferenceCache.cpp index a56485cdbc67..3cab9e5734ee 100644 --- a/llvm/lib/CodeGen/InterferenceCache.cpp +++ b/llvm/lib/CodeGen/InterferenceCache.cpp @@ -56,8 +56,8 @@ void InterferenceCache::init(MachineFunction *mf, LIUArray = liuarray; TRI = tri; reinitPhysRegEntries(); - for (unsigned i = 0; i != CacheEntries; ++i) - Entries[i].clear(mf, indexes, lis); + for (Entry &E : Entries) + E.clear(mf, indexes, lis); } InterferenceCache::Entry *InterferenceCache::get(MCRegister PhysReg) { diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index cf62b0e5d7e8..e97dcca201e8 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -1249,8 +1249,8 @@ bool InstrRefBasedLDV::transferDebugPHI(MachineInstr &MI) { std::array<unsigned, 4> CandidateSizes = {64, 32, 16, 8}; Optional<ValueIDNum> Result = None; Optional<LocIdx> SpillLoc = None; - for (unsigned int I = 0; I < CandidateSizes.size(); ++I) { - unsigned SpillID = MTracker->getLocID(SpillNo, {CandidateSizes[I], 0}); + for (unsigned CS : CandidateSizes) { + unsigned SpillID = MTracker->getLocID(SpillNo, {CS, 0}); SpillLoc = MTracker->getSpillMLoc(SpillID); ValueIDNum Val = MTracker->readMLoc(*SpillLoc); // If this value was defined in it's own position, then it was probably diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp index a632d3d9ce76..b4dd41bbb810 
100644 --- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp @@ -492,10 +492,10 @@ private: static VarLoc CreateCopyLoc(const VarLoc &OldVL, const MachineLoc &OldML, Register NewReg) { VarLoc VL = OldVL; - for (size_t I = 0, E = VL.Locs.size(); I < E; ++I) - if (VL.Locs[I] == OldML) { - VL.Locs[I].Kind = MachineLocKind::RegisterKind; - VL.Locs[I].Value.RegNo = NewReg; + for (MachineLoc &ML : VL.Locs) + if (ML == OldML) { + ML.Kind = MachineLocKind::RegisterKind; + ML.Value.RegNo = NewReg; return VL; } llvm_unreachable("Should have found OldML in new VarLoc."); @@ -506,10 +506,10 @@ private: static VarLoc CreateSpillLoc(const VarLoc &OldVL, const MachineLoc &OldML, unsigned SpillBase, StackOffset SpillOffset) { VarLoc VL = OldVL; - for (int I = 0, E = VL.Locs.size(); I < E; ++I) - if (VL.Locs[I] == OldML) { - VL.Locs[I].Kind = MachineLocKind::SpillLocKind; - VL.Locs[I].Value.SpillLocation = {SpillBase, SpillOffset}; + for (MachineLoc &ML : VL.Locs) + if (ML == OldML) { + ML.Kind = MachineLocKind::SpillLocKind; + ML.Value.SpillLocation = {SpillBase, SpillOffset}; return VL; } llvm_unreachable("Should have found OldML in new VarLoc."); diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp index 5f976bf43c5b..e6661e5135c3 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.cpp +++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp @@ -822,9 +822,6 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) { // register that hasn't been defined yet. If we do not remove those here, then // the re-insertion of the DBG_VALUE instruction after register allocation // will be incorrect. - // TODO: If earlier passes are corrected to generate sane debug information - // (and if the machine verifier is improved to catch this), then these checks - // could be removed or replaced by asserts. 
bool Discard = false; for (const MachineOperand &Op : MI.debug_operands()) { if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) { @@ -1341,8 +1338,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs, bool DidChange = false; LocMap::iterator LocMapI; LocMapI.setMap(locInts); - for (unsigned i = 0; i != NewRegs.size(); ++i) { - LiveInterval *LI = &LIS.getInterval(NewRegs[i]); + for (Register NewReg : NewRegs) { + LiveInterval *LI = &LIS.getInterval(NewReg); if (LI->empty()) continue; @@ -1500,8 +1497,8 @@ void LDVImpl::splitRegister(Register OldReg, ArrayRef<Register> NewRegs) { // Map all of the new virtual registers. UserValue *UV = lookupVirtReg(OldReg); - for (unsigned i = 0; i != NewRegs.size(); ++i) - mapVirtReg(NewRegs[i], UV); + for (Register NewReg : NewRegs) + mapVirtReg(NewReg, UV); } void LiveDebugVariables:: diff --git a/llvm/lib/CodeGen/LiveDebugVariables.h b/llvm/lib/CodeGen/LiveDebugVariables.h index 07dd3a83866f..9998ce9e8dad 100644 --- a/llvm/lib/CodeGen/LiveDebugVariables.h +++ b/llvm/lib/CodeGen/LiveDebugVariables.h @@ -56,6 +56,11 @@ private: bool runOnMachineFunction(MachineFunction &) override; void releaseMemory() override; void getAnalysisUsage(AnalysisUsage &) const override; + + MachineFunctionProperties getSetProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::TracksDebugUserValues); + } }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 6380c4bfd6e6..05768140cbdf 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -133,6 +133,22 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, if (OVNI != li.getVNInfoAt(UseIdx)) return false; + + // Check that subrange is live at UseIdx. 
+ if (MO.getSubReg()) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + LaneBitmask LM = TRI->getSubRegIndexLaneMask(MO.getSubReg()); + for (LiveInterval::SubRange &SR : li.subranges()) { + if ((SR.LaneMask & LM).none()) + continue; + if (!SR.liveAt(UseIdx)) + return false; + // Early exit if all used lanes are checked. No need to continue. + LM &= ~SR.LaneMask; + if (LM.none()) + break; + } + } } return true; } diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index e8744797707b..94bdfab5e5e0 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -141,8 +141,8 @@ void LiveVariables::HandleVirtRegUse(Register Reg, MachineBasicBlock *MBB, } #ifndef NDEBUG - for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i) - assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!"); + for (MachineInstr *Kill : VRInfo.Kills) + assert(Kill->getParent() != MBB && "entry should be at end!"); #endif // This situation can occur: @@ -534,8 +534,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI, MachineBasicBlock *MBB = MI.getParent(); // Process all uses. - for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) { - unsigned MOReg = UseRegs[i]; + for (unsigned MOReg : UseRegs) { if (Register::isVirtualRegister(MOReg)) HandleVirtRegUse(MOReg, MBB, MI); else if (!MRI->isReserved(MOReg)) @@ -543,12 +542,11 @@ void LiveVariables::runOnInstr(MachineInstr &MI, } // Process all masked registers. (Call clobbers). - for (unsigned i = 0, e = RegMasks.size(); i != e; ++i) - HandleRegMask(MI.getOperand(RegMasks[i])); + for (unsigned Mask : RegMasks) + HandleRegMask(MI.getOperand(Mask)); // Process all defs. 
- for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) { - unsigned MOReg = DefRegs[i]; + for (unsigned MOReg : DefRegs) { if (Register::isVirtualRegister(MOReg)) HandleVirtRegDef(MOReg, MI); else if (!MRI->isReserved(MOReg)) diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp index ee2387d1e8e6..37fd3e4853ac 100644 --- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -210,7 +210,11 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) { StackObjSet SmallArrayObjs; StackObjSet AddrOfObjs; - AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, MaxAlign); + // Only place the stack protector in the local stack area if the target + // allows it. + if (TFI.isStackIdSafeForLocalArea(MFI.getStackID(StackProtectorFI))) + AdjustStackOffset(MFI, StackProtectorFI, Offset, StackGrowsDown, + MaxAlign); // Assign large stack objects first. for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 6221b5929301..d0323eaf3d78 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -350,18 +350,33 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { bool HasPHI = false; bool HasInlineAsm = false; + bool AllTiedOpsRewritten = true, HasTiedOps = false; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { if (MI.isPHI()) HasPHI = true; if (MI.isInlineAsm()) HasInlineAsm = true; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); + if (!MO.isReg() || !MO.getReg()) + continue; + unsigned DefIdx; + if (MO.isUse() && MI.isRegTiedToDefOperand(I, &DefIdx)) { + HasTiedOps = true; + if (MO.getReg() != MI.getOperand(DefIdx).getReg()) + AllTiedOpsRewritten = false; + } + } } } if (!HasPHI) 
Properties.set(MachineFunctionProperties::Property::NoPHIs); MF.setHasInlineAsm(HasInlineAsm); + if (HasTiedOps && AllTiedOpsRewritten) + Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten); + if (isSSA(MF)) Properties.set(MachineFunctionProperties::Property::IsSSA); else @@ -457,6 +472,9 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, if (YamlMF.FailsVerification) MF.getProperties().set( MachineFunctionProperties::Property::FailsVerification); + if (YamlMF.TracksDebugUserValues) + MF.getProperties().set( + MachineFunctionProperties::Property::TracksDebugUserValues); PerFunctionMIParsingState PFS(MF, SM, IRSlots, *Target); if (parseRegisterInfo(PFS, YamlMF)) diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index f1369396e37f..dc72f83ad0e4 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -219,6 +219,8 @@ void MIRPrinter::print(const MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel); YamlMF.FailsVerification = MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailsVerification); + YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::TracksDebugUserValues); convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker MST(&MF); diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 23c511aaa056..8c9d00d08c6a 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -193,7 +193,7 @@ void ilist_traits<MachineInstr>::transferNodesFromList(ilist_traits &FromList, void ilist_traits<MachineInstr>::deleteNode(MachineInstr *MI) { assert(!MI->getParent() && "MI is still in a block!"); - Parent->getParent()->DeleteMachineInstr(MI); + Parent->getParent()->deleteMachineInstr(MI); } MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() { @@ -1038,16 
+1038,15 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // Collect a list of virtual registers killed by the terminators. SmallVector<Register, 4> KilledRegs; if (LV) - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) { - MachineInstr *MI = &*I; - for (MachineOperand &MO : MI->operands()) { + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) { + for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.getReg() == 0 || !MO.isUse() || !MO.isKill() || MO.isUndef()) continue; Register Reg = MO.getReg(); if (Register::isPhysicalRegister(Reg) || - LV->getVarInfo(Reg).removeKill(*MI)) { + LV->getVarInfo(Reg).removeKill(MI)) { KilledRegs.push_back(Reg); LLVM_DEBUG(dbgs() << "Removing terminator kill: " << MI); MO.setIsKill(false); @@ -1057,11 +1056,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( SmallVector<Register, 4> UsedRegs; if (LIS) { - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) { - MachineInstr *MI = &*I; - - for (const MachineOperand &MO : MI->operands()) { + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) { + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg() || MO.getReg() == 0) continue; @@ -1078,9 +1075,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( // SlotIndexes. 
SmallVector<MachineInstr*, 4> Terminators; if (Indexes) { - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) - Terminators.push_back(&*I); + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) + Terminators.push_back(&MI); } // Since we replaced all uses of Succ with NMBB, that should also be treated @@ -1091,9 +1088,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( if (Indexes) { SmallVector<MachineInstr*, 4> NewTerminators; - for (instr_iterator I = getFirstInstrTerminator(), E = instr_end(); - I != E; ++I) - NewTerminators.push_back(&*I); + for (MachineInstr &MI : + llvm::make_range(getFirstInstrTerminator(), instr_end())) + NewTerminators.push_back(&MI); for (MachineInstr *Terminator : Terminators) { if (!is_contained(NewTerminators, Terminator)) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 8a1b4031642d..692587cd58fa 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -193,6 +194,11 @@ static cl::opt<unsigned> TriangleChainCount( cl::init(2), cl::Hidden); +static cl::opt<bool> EnableExtTspBlockPlacement( + "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false), + cl::desc("Enable machine block placement based on the ext-tsp model, " + "optimizing I-cache utilization.")); + namespace llvm { extern cl::opt<unsigned> StaticLikelyProb; extern cl::opt<unsigned> ProfileLikelyProb; @@ -557,6 +563,15 @@ class MachineBlockPlacement : public MachineFunctionPass { /// but a local analysis would not find them. void precomputeTriangleChains(); + /// Apply a post-processing step optimizing block placement. 
+ void applyExtTsp(); + + /// Modify the existing block placement in the function and adjust all jumps. + void assignBlockOrder(const std::vector<const MachineBasicBlock *> &NewOrder); + + /// Create a single CFG chain from the current block order. + void createCFGChainExtTsp(); + public: static char ID; // Pass identification, replacement for typeid @@ -3387,6 +3402,15 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { } } + // Apply a post-processing optimizing block placement. + if (MF.size() >= 3 && EnableExtTspBlockPlacement) { + // Find a new placement and modify the layout of the blocks in the function. + applyExtTsp(); + + // Re-create CFG chain so that we can optimizeBranches and alignBlocks. + createCFGChainExtTsp(); + } + optimizeBranches(); alignBlocks(); @@ -3413,12 +3437,147 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { MBFI->view("MBP." + MF.getName(), false); } - // We always return true as we have no way to track whether the final order // differs from the original order. return true; } +void MachineBlockPlacement::applyExtTsp() { + // Prepare data; blocks are indexed by their index in the current ordering. + DenseMap<const MachineBasicBlock *, uint64_t> BlockIndex; + BlockIndex.reserve(F->size()); + std::vector<const MachineBasicBlock *> CurrentBlockOrder; + CurrentBlockOrder.reserve(F->size()); + size_t NumBlocks = 0; + for (const MachineBasicBlock &MBB : *F) { + BlockIndex[&MBB] = NumBlocks++; + CurrentBlockOrder.push_back(&MBB); + } + + auto BlockSizes = std::vector<uint64_t>(F->size()); + auto BlockCounts = std::vector<uint64_t>(F->size()); + DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> JumpCounts; + for (MachineBasicBlock &MBB : *F) { + // Getting the block frequency. 
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); + BlockCounts[BlockIndex[&MBB]] = BlockFreq.getFrequency(); + // Getting the block size: + // - approximate the size of an instruction by 4 bytes, and + // - ignore debug instructions. + // Note: getting the exact size of each block is target-dependent and can be + // done by extending the interface of MCCodeEmitter. Experimentally we do + // not see a perf improvement with the exact block sizes. + auto NonDbgInsts = + instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end()); + int NumInsts = std::distance(NonDbgInsts.begin(), NonDbgInsts.end()); + BlockSizes[BlockIndex[&MBB]] = 4 * NumInsts; + // Getting jump frequencies. + for (MachineBasicBlock *Succ : MBB.successors()) { + auto EP = MBPI->getEdgeProbability(&MBB, Succ); + BlockFrequency EdgeFreq = BlockFreq * EP; + auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); + JumpCounts[Edge] = EdgeFreq.getFrequency(); + } + } + + LLVM_DEBUG(dbgs() << "Applying ext-tsp layout for |V| = " << F->size() + << " with profile = " << F->getFunction().hasProfileData() + << " (" << F->getName().str() << ")" + << "\n"); + LLVM_DEBUG( + dbgs() << format(" original layout score: %0.2f\n", + calcExtTspScore(BlockSizes, BlockCounts, JumpCounts))); + + // Run the layout algorithm. + auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + std::vector<const MachineBasicBlock *> NewBlockOrder; + NewBlockOrder.reserve(F->size()); + for (uint64_t Node : NewOrder) { + NewBlockOrder.push_back(CurrentBlockOrder[Node]); + } + LLVM_DEBUG(dbgs() << format(" optimized layout score: %0.2f\n", + calcExtTspScore(NewOrder, BlockSizes, BlockCounts, + JumpCounts))); + + // Assign new block order. 
+ assignBlockOrder(NewBlockOrder); +} + +void MachineBlockPlacement::assignBlockOrder( + const std::vector<const MachineBasicBlock *> &NewBlockOrder) { + assert(F->size() == NewBlockOrder.size() && "Incorrect size of block order"); + F->RenumberBlocks(); + + bool HasChanges = false; + for (size_t I = 0; I < NewBlockOrder.size(); I++) { + if (NewBlockOrder[I] != F->getBlockNumbered(I)) { + HasChanges = true; + break; + } + } + // Stop early if the new block order is identical to the existing one. + if (!HasChanges) + return; + + SmallVector<MachineBasicBlock *, 4> PrevFallThroughs(F->getNumBlockIDs()); + for (auto &MBB : *F) { + PrevFallThroughs[MBB.getNumber()] = MBB.getFallThrough(); + } + + // Sort basic blocks in the function according to the computed order. + DenseMap<const MachineBasicBlock *, size_t> NewIndex; + for (const MachineBasicBlock *MBB : NewBlockOrder) { + NewIndex[MBB] = NewIndex.size(); + } + F->sort([&](MachineBasicBlock &L, MachineBasicBlock &R) { + return NewIndex[&L] < NewIndex[&R]; + }); + + // Update basic block branches by inserting explicit fallthrough branches + // when required and re-optimize branches when possible. + const TargetInstrInfo *TII = F->getSubtarget().getInstrInfo(); + SmallVector<MachineOperand, 4> Cond; + for (auto &MBB : *F) { + MachineFunction::iterator NextMBB = std::next(MBB.getIterator()); + MachineFunction::iterator EndIt = MBB.getParent()->end(); + auto *FTMBB = PrevFallThroughs[MBB.getNumber()]; + // If this block had a fallthrough before we need an explicit unconditional + // branch to that block if the fallthrough block is not adjacent to the + // block in the new order. + if (FTMBB && (NextMBB == EndIt || &*NextMBB != FTMBB)) { + TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc()); + } + + // It might be possible to optimize branches by flipping the condition. 
+ Cond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->analyzeBranch(MBB, TBB, FBB, Cond)) + continue; + MBB.updateTerminator(FTMBB); + } + +#ifndef NDEBUG + // Make sure we correctly constructed all branches. + F->verify(this, "After optimized block reordering"); +#endif +} + +void MachineBlockPlacement::createCFGChainExtTsp() { + BlockToChain.clear(); + ComputedEdges.clear(); + ChainAllocator.DestroyAll(); + + MachineBasicBlock *HeadBB = &F->front(); + BlockChain *FunctionChain = + new (ChainAllocator.Allocate()) BlockChain(BlockToChain, HeadBB); + + for (MachineBasicBlock &MBB : *F) { + if (HeadBB == &MBB) + continue; // Ignore head of the chain + FunctionChain->merge(&MBB, nullptr); + } +} + namespace { /// A pass to compute block placement statistics. diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index e2b6cfe55c16..72ab9ee4f388 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -485,7 +485,7 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI, MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr); for (auto *InstrPtr : DelInstrs) { - InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval(); + InstrPtr->eraseFromParent(); // Erase all LiveRegs defined by the removed instruction for (auto I = RegUnits.begin(); I != RegUnits.end(); ) { if (I->MI == InstrPtr) @@ -693,7 +693,7 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { // use for them. 
MachineFunction *MF = MBB->getParent(); for (auto *InstrPtr : InsInstrs) - MF->DeleteMachineInstr(InstrPtr); + MF->deleteMachineInstr(InstrPtr); } InstrIdxForVirtReg.clear(); } diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 7c83bacd80d9..57fbe4112e47 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -847,31 +847,27 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName() << "\n"); - for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); - I != E;) { - MachineInstr *MI = &*I; - ++I; - + for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) { // Ignore non-trivial COPYs. - if (MI->isCopy() && MI->getNumOperands() == 2 && - !TRI->regsOverlap(MI->getOperand(0).getReg(), - MI->getOperand(1).getReg())) { + if (MI.isCopy() && MI.getNumOperands() == 2 && + !TRI->regsOverlap(MI.getOperand(0).getReg(), + MI.getOperand(1).getReg())) { - MCRegister Def = MI->getOperand(0).getReg().asMCReg(); - MCRegister Src = MI->getOperand(1).getReg().asMCReg(); + MCRegister Def = MI.getOperand(0).getReg().asMCReg(); + MCRegister Src = MI.getOperand(1).getReg().asMCReg(); // Unlike forward cp, we don't invoke propagateDefs here, // just let forward cp do COPY-to-COPY propagation. - if (isBackwardPropagatableCopy(*MI, *MRI)) { + if (isBackwardPropagatableCopy(MI, *MRI)) { Tracker.invalidateRegister(Src, *TRI); Tracker.invalidateRegister(Def, *TRI); - Tracker.trackCopy(MI, *TRI); + Tracker.trackCopy(&MI, *TRI); continue; } } // Invalidate any earlyclobber regs first. 
- for (const MachineOperand &MO : MI->operands()) + for (const MachineOperand &MO : MI.operands()) if (MO.isReg() && MO.isEarlyClobber()) { MCRegister Reg = MO.getReg().asMCReg(); if (!Reg) @@ -879,8 +875,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Tracker.invalidateRegister(Reg, *TRI); } - propagateDefs(*MI); - for (const MachineOperand &MO : MI->operands()) { + propagateDefs(MI); + for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -898,7 +894,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( for (MCRegUnitIterator RUI(MO.getReg().asMCReg(), TRI); RUI.isValid(); ++RUI) { if (auto *Copy = Tracker.findCopyDefViaUnit(*RUI, *TRI)) { - CopyDbgUsers[Copy].insert(MI); + CopyDbgUsers[Copy].insert(&MI); } } } else { diff --git a/llvm/lib/CodeGen/MachineCycleAnalysis.cpp b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp new file mode 100644 index 000000000000..42a5e2b7af01 --- /dev/null +++ b/llvm/lib/CodeGen/MachineCycleAnalysis.cpp @@ -0,0 +1,113 @@ +//===- MachineCycleAnalysis.cpp - Compute CycleInfo for Machine IR --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineCycleAnalysis.h" +#include "llvm/ADT/GenericCycleImpl.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineSSAContext.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +template class llvm::GenericCycleInfo<llvm::MachineSSAContext>; +template class llvm::GenericCycle<llvm::MachineSSAContext>; + +namespace { + +/// Legacy analysis pass which computes a \ref MachineCycleInfo. 
+class MachineCycleInfoWrapperPass : public MachineFunctionPass { + MachineFunction *F = nullptr; + MachineCycleInfo CI; + +public: + static char ID; + + MachineCycleInfoWrapperPass(); + + MachineCycleInfo &getCycleInfo() { return CI; } + const MachineCycleInfo &getCycleInfo() const { return CI; } + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void releaseMemory() override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + // TODO: verify analysis +}; + +class MachineCycleInfoPrinterPass : public MachineFunctionPass { +public: + static char ID; + + MachineCycleInfoPrinterPass(); + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} // namespace + +char MachineCycleInfoWrapperPass::ID = 0; + +MachineCycleInfoWrapperPass::MachineCycleInfoWrapperPass() + : MachineFunctionPass(ID) { + initializeMachineCycleInfoWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(MachineCycleInfoWrapperPass, "machine-cycles", + "Machine Cycle Info Analysis", true, true) +INITIALIZE_PASS_END(MachineCycleInfoWrapperPass, "machine-cycles", + "Machine Cycle Info Analysis", true, true) + +void MachineCycleInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool MachineCycleInfoWrapperPass::runOnMachineFunction(MachineFunction &Func) { + CI.clear(); + + F = &Func; + CI.compute(Func); + return false; +} + +void MachineCycleInfoWrapperPass::print(raw_ostream &OS, const Module *) const { + OS << "MachineCycleInfo for function: " << F->getName() << "\n"; + CI.print(OS); +} + +void MachineCycleInfoWrapperPass::releaseMemory() { + CI.clear(); + F = nullptr; +} + +char MachineCycleInfoPrinterPass::ID = 0; + +MachineCycleInfoPrinterPass::MachineCycleInfoPrinterPass() + : MachineFunctionPass(ID) { + 
initializeMachineCycleInfoPrinterPassPass(*PassRegistry::getPassRegistry()); +} + +INITIALIZE_PASS_BEGIN(MachineCycleInfoPrinterPass, "print-machine-cycles", + "Print Machine Cycle Info Analysis", true, true) +INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass) +INITIALIZE_PASS_END(MachineCycleInfoPrinterPass, "print-machine-cycles", + "Print Machine Cycle Info Analysis", true, true) + +void MachineCycleInfoPrinterPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineCycleInfoWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool MachineCycleInfoPrinterPass::runOnMachineFunction(MachineFunction &F) { + auto &CI = getAnalysis<MachineCycleInfoWrapperPass>(); + CI.print(errs()); + return false; +} diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 310c2721c3bd..81ed3d0e93ff 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -89,6 +89,7 @@ static cl::opt<unsigned> AlignAllFunctions( static const char *getPropertyName(MachineFunctionProperties::Property Prop) { using P = MachineFunctionProperties::Property; + // clang-format off switch(Prop) { case P::FailedISel: return "FailedISel"; case P::IsSSA: return "IsSSA"; @@ -100,7 +101,9 @@ static const char *getPropertyName(MachineFunctionProperties::Property Prop) { case P::TracksLiveness: return "TracksLiveness"; case P::TiedOpsRewritten: return "TiedOpsRewritten"; case P::FailsVerification: return "FailsVerification"; + case P::TracksDebugUserValues: return "TracksDebugUserValues"; } + // clang-format on llvm_unreachable("Invalid machine function property"); } @@ -125,7 +128,7 @@ void MachineFunctionProperties::print(raw_ostream &OS) const { MachineFunctionInfo::~MachineFunctionInfo() = default; void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) { - MBB->getParent()->DeleteMachineBasicBlock(MBB); + 
MBB->getParent()->deleteMachineBasicBlock(MBB); } static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI, @@ -347,10 +350,10 @@ void MachineFunction::assignBeginEndSections() { /// Allocate a new MachineInstr. Use this instead of `new MachineInstr'. MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID, - const DebugLoc &DL, + DebugLoc DL, bool NoImplicit) { return new (InstructionRecycler.Allocate<MachineInstr>(Allocator)) - MachineInstr(*this, MCID, DL, NoImplicit); + MachineInstr(*this, MCID, std::move(DL), NoImplicit); } /// Create a new MachineInstr which is a copy of the 'Orig' instruction, @@ -361,8 +364,9 @@ MachineFunction::CloneMachineInstr(const MachineInstr *Orig) { MachineInstr(*this, *Orig); } -MachineInstr &MachineFunction::CloneMachineInstrBundle(MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) { +MachineInstr &MachineFunction::cloneMachineInstrBundle( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + const MachineInstr &Orig) { MachineInstr *FirstClone = nullptr; MachineBasicBlock::const_instr_iterator I = Orig.getIterator(); while (true) { @@ -390,8 +394,7 @@ MachineInstr &MachineFunction::CloneMachineInstrBundle(MachineBasicBlock &MBB, /// /// This function also serves as the MachineInstr destructor - the real /// ~MachineInstr() destructor must be empty. -void -MachineFunction::DeleteMachineInstr(MachineInstr *MI) { +void MachineFunction::deleteMachineInstr(MachineInstr *MI) { // Verify that a call site info is at valid state. This assertion should // be triggered during the implementation of support for the // call site info of a new architecture. If the assertion is triggered, @@ -418,8 +421,7 @@ MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) { } /// Delete the given MachineBasicBlock. 
-void -MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) { +void MachineFunction::deleteMachineBasicBlock(MachineBasicBlock *MBB) { assert(MBB->getParent() == this && "MBB parent mismatch!"); // Clean up any references to MBB in jump tables before deleting it. if (JumpTableInfo) @@ -769,8 +771,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) { void MachineFunction::addCatchTypeInfo(MachineBasicBlock *LandingPad, ArrayRef<const GlobalValue *> TyInfo) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); - for (unsigned N = TyInfo.size(); N; --N) - LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1])); + for (const GlobalValue *GV : llvm::reverse(TyInfo)) + LP.TypeIds.push_back(getTypeIDFor(GV)); } void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad, @@ -1404,10 +1406,10 @@ MachineConstantPool::~MachineConstantPool() { // A constant may be a member of both Constants and MachineCPVsSharingEntries, // so keep track of which we've deleted to avoid double deletions. DenseSet<MachineConstantPoolValue*> Deleted; - for (unsigned i = 0, e = Constants.size(); i != e; ++i) - if (Constants[i].isMachineConstantPoolEntry()) { - Deleted.insert(Constants[i].Val.MachineCPVal); - delete Constants[i].Val.MachineCPVal; + for (const MachineConstantPoolEntry &C : Constants) + if (C.isMachineConstantPoolEntry()) { + Deleted.insert(C.Val.MachineCPVal); + delete C.Val.MachineCPVal; } for (MachineConstantPoolValue *CPV : MachineCPVsSharingEntries) { if (Deleted.count(CPV) == 0) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index aaa80432d2f2..85b266afceef 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -115,10 +115,10 @@ void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) { /// MachineInstr ctor - This constructor creates a MachineInstr and adds the /// implicit operands. 
It reserves space for the number of operands specified by /// the MCInstrDesc. -MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, - DebugLoc dl, bool NoImp) - : MCID(&tid), debugLoc(std::move(dl)), DebugInstrNum(0) { - assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); +MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &TID, + DebugLoc DL, bool NoImp) + : MCID(&TID), DbgLoc(std::move(DL)), DebugInstrNum(0) { + assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); // Reserve space for the expected number of operands. if (unsigned NumOps = MCID->getNumOperands() + @@ -135,9 +135,9 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid, /// Does not copy the number from debug instruction numbering, to preserve /// uniqueness. MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI) - : MCID(&MI.getDesc()), Info(MI.Info), debugLoc(MI.getDebugLoc()), + : MCID(&MI.getDesc()), Info(MI.Info), DbgLoc(MI.getDebugLoc()), DebugInstrNum(0) { - assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor"); + assert(DbgLoc.hasTrivialDestructor() && "Expected trivial destructor"); CapOperands = OperandCapacity::get(MI.getNumOperands()); Operands = MF.allocateOperandArray(CapOperands); @@ -682,26 +682,6 @@ void MachineInstr::eraseFromParent() { getParent()->erase(this); } -void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() { - assert(getParent() && "Not embedded in a basic block!"); - MachineBasicBlock *MBB = getParent(); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Not embedded in a function!"); - - MachineInstr *MI = (MachineInstr *)this; - MachineRegisterInfo &MRI = MF->getRegInfo(); - - for (const MachineOperand &MO : MI->operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (!Reg.isVirtual()) - continue; - MRI.markUsesInDebugValueAsUndef(Reg); - } - MI->eraseFromParent(); -} - void 
MachineInstr::eraseFromBundle() { assert(getParent() && "Not embedded in a basic block!"); getParent()->erase_instr(this); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 8d6459a627fa..762395542b40 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -649,7 +649,7 @@ void SwingSchedulerDAG::schedule() { /// Clean up after the software pipeliner runs. void SwingSchedulerDAG::finishBlock() { for (auto &KV : NewMIs) - MF.DeleteMachineInstr(KV.second); + MF.deleteMachineInstr(KV.second); NewMIs.clear(); // Call the superclass. @@ -1101,17 +1101,15 @@ unsigned SwingSchedulerDAG::calculateResMII() { // Sort the instructions by the number of available choices for scheduling, // least to most. Use the number of critical resources as the tie breaker. FuncUnitSorter FUS = FuncUnitSorter(MF.getSubtarget()); - for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), - E = MBB->getFirstTerminator(); - I != E; ++I) - FUS.calcCriticalResources(*I); + for (MachineInstr &MI : + llvm::make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator())) + FUS.calcCriticalResources(MI); PriorityQueue<MachineInstr *, std::vector<MachineInstr *>, FuncUnitSorter> FuncUnitOrder(FUS); - for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(), - E = MBB->getFirstTerminator(); - I != E; ++I) - FuncUnitOrder.push(&*I); + for (MachineInstr &MI : + llvm::make_range(MBB->getFirstNonPHI(), MBB->getFirstTerminator())) + FuncUnitOrder.push(&MI); while (!FuncUnitOrder.empty()) { MachineInstr *MI = FuncUnitOrder.top(); @@ -1192,14 +1190,10 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) { /// but we do this to find the circuits, and then change them back. 
static void swapAntiDependences(std::vector<SUnit> &SUnits) { SmallVector<std::pair<SUnit *, SDep>, 8> DepsAdded; - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { - SUnit *SU = &SUnits[i]; - for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end(); - IP != EP; ++IP) { - if (IP->getKind() != SDep::Anti) - continue; - DepsAdded.push_back(std::make_pair(SU, *IP)); - } + for (SUnit &SU : SUnits) { + for (SDep &Pred : SU.Preds) + if (Pred.getKind() == SDep::Anti) + DepsAdded.push_back(std::make_pair(&SU, Pred)); } for (std::pair<SUnit *, SDep> &P : DepsAdded) { // Remove this anti dependency and add one in the reverse direction. @@ -1471,27 +1465,23 @@ void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) { } // Compute ALAP, ZeroLatencyHeight, and MOV. - for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(), - E = Topo.rend(); - I != E; ++I) { + for (int I : llvm::reverse(Topo)) { int alap = maxASAP; int zeroLatencyHeight = 0; - SUnit *SU = &SUnits[*I]; - for (SUnit::const_succ_iterator IS = SU->Succs.begin(), - ES = SU->Succs.end(); - IS != ES; ++IS) { - SUnit *succ = IS->getSUnit(); - if (IS->getLatency() == 0) + SUnit *SU = &SUnits[I]; + for (const SDep &S : SU->Succs) { + SUnit *succ = S.getSUnit(); + if (S.getLatency() == 0) zeroLatencyHeight = std::max(zeroLatencyHeight, getZeroLatencyHeight(succ) + 1); - if (ignoreDependence(*IS, true)) + if (ignoreDependence(S, true)) continue; - alap = std::min(alap, (int)(getALAP(succ) - IS->getLatency() + - getDistance(SU, succ, *IS) * MII)); + alap = std::min(alap, (int)(getALAP(succ) - S.getLatency() + + getDistance(SU, succ, S) * MII)); } - ScheduleInfo[*I].ALAP = alap; - ScheduleInfo[*I].ZeroLatencyHeight = zeroLatencyHeight; + ScheduleInfo[I].ALAP = alap; + ScheduleInfo[I].ZeroLatencyHeight = zeroLatencyHeight; } // After computing the node functions, compute the summary for each node set. 
@@ -1548,9 +1538,8 @@ static bool succ_L(SetVector<SUnit *> &NodeOrder, SmallSetVector<SUnit *, 8> &Succs, const NodeSet *S = nullptr) { Succs.clear(); - for (SetVector<SUnit *>::iterator I = NodeOrder.begin(), E = NodeOrder.end(); - I != E; ++I) { - for (SDep &Succ : (*I)->Succs) { + for (const SUnit *SU : NodeOrder) { + for (const SDep &Succ : SU->Succs) { if (S && S->count(Succ.getSUnit()) == 0) continue; if (ignoreDependence(Succ, false)) @@ -1558,7 +1547,7 @@ static bool succ_L(SetVector<SUnit *> &NodeOrder, if (NodeOrder.count(Succ.getSUnit()) == 0) Succs.insert(Succ.getSUnit()); } - for (SDep &Pred : (*I)->Preds) { + for (const SDep &Pred : SU->Preds) { if (Pred.getKind() != SDep::Anti) continue; if (S && S->count(Pred.getSUnit()) == 0) @@ -2202,7 +2191,7 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI, MachineInstr *NewMI = MF.CloneMachineInstr(MI); NewMI->getOperand(OffsetPosLd).setImm(LoadOffset + StoreOffset); bool Disjoint = TII->areMemAccessesTriviallyDisjoint(*NewMI, *PrevDef); - MF.DeleteMachineInstr(NewMI); + MF.deleteMachineInstr(NewMI); if (!Disjoint) return false; @@ -2885,10 +2874,8 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { ++stage) { std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[cycle + (stage * InitiationInterval)]; - for (std::deque<SUnit *>::reverse_iterator I = cycleInstrs.rbegin(), - E = cycleInstrs.rend(); - I != E; ++I) - ScheduledInstrs[cycle].push_front(*I); + for (SUnit *SU : llvm::reverse(cycleInstrs)) + ScheduledInstrs[cycle].push_front(SU); } } @@ -2899,10 +2886,8 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { // Change the registers in instruction as specified in the InstrChanges // map. We need to use the new registers to create the correct order. 
- for (int i = 0, e = SSD->SUnits.size(); i != e; ++i) { - SUnit *SU = &SSD->SUnits[i]; - SSD->applyInstrChange(SU->getInstr(), *this); - } + for (const SUnit &SU : SSD->SUnits) + SSD->applyInstrChange(SU.getInstr(), *this); // Reorder the instructions in each cycle to fix and improve the // generated code. diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp new file mode 100644 index 000000000000..8db893535daf --- /dev/null +++ b/llvm/lib/CodeGen/MachineSSAContext.cpp @@ -0,0 +1,52 @@ +//===- MachineSSAContext.cpp ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines a specialization of the GenericSSAContext<X> +/// template class for Machine IR. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineSSAContext.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +MachineBasicBlock *MachineSSAContext::getEntryBlock(MachineFunction &F) { + return &F.front(); +} + +void MachineSSAContext::setFunction(MachineFunction &Fn) { + MF = &Fn; + RegInfo = &MF->getRegInfo(); +} + +Printable MachineSSAContext::print(MachineBasicBlock *Block) const { + return Printable([Block](raw_ostream &Out) { Block->printName(Out); }); +} + +Printable MachineSSAContext::print(MachineInstr *I) const { + return Printable([I](raw_ostream &Out) { I->print(Out); }); +} + +Printable MachineSSAContext::print(Register Value) const { + auto *MRI = RegInfo; + return Printable([MRI, Value](raw_ostream &Out) { + Out << printReg(Value, MRI->getTargetRegisterInfo(), 0, MRI); + + if (Value) { + // Try to print the definition. + if (auto *Instr = MRI->getUniqueVRegDef(Value)) { + Out << ": "; + Instr->print(Out); + } + } + }); +} diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 930677e4fd7d..48076663ddf5 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -126,7 +126,9 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode, } /// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that -/// is live in the middle of the specified block. +/// is live in the middle of the specified block. If ExistingValueOnly is +/// true then this will only return an existing value or $noreg; otherwise new +/// instructions may be inserted to materialize a value. 
/// /// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one /// important case: if there is a definition of the rewritten value after the @@ -143,14 +145,18 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode, /// their respective blocks. However, the use of X happens in the *middle* of /// a block. Because of this, we need to insert a new PHI node in SomeBB to /// merge the appropriate values, and this value isn't live out of the block. -Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { +Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB, + bool ExistingValueOnly) { // If there is no definition of the renamed variable in this block, just use // GetValueAtEndOfBlock to do our work. if (!HasValueForBlock(BB)) - return GetValueAtEndOfBlockInternal(BB); + return GetValueAtEndOfBlockInternal(BB, ExistingValueOnly); // If there are no predecessors, just return undef. if (BB->pred_empty()) { + // If we cannot insert new instructions, just return $noreg. + if (ExistingValueOnly) + return Register(); // Insert an implicit_def to represent an undef value. MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstTerminator(), @@ -165,7 +171,7 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { bool isFirstPred = true; for (MachineBasicBlock *PredBB : BB->predecessors()) { - Register PredVal = GetValueAtEndOfBlockInternal(PredBB); + Register PredVal = GetValueAtEndOfBlockInternal(PredBB, ExistingValueOnly); PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -185,6 +191,10 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) { if (DupPHI) return DupPHI; + // If we cannot create new instructions, return $noreg now. + if (ExistingValueOnly) + return Register(); + // Otherwise, we do need a PHI: insert one now. MachineBasicBlock::iterator Loc = BB->empty() ? 
BB->end() : BB->begin(); MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB, @@ -350,10 +360,13 @@ public: /// for the specified BB and if so, return it. If not, construct SSA form by /// first calculating the required placement of PHIs and then inserting new /// PHIs where needed. -Register MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){ +Register +MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB, + bool ExistingValueOnly) { AvailableValsTy &AvailableVals = getAvailableVals(AV); - if (Register V = AvailableVals[BB]) - return V; + Register ExistingVal = AvailableVals.lookup(BB); + if (ExistingVal || ExistingValueOnly) + return ExistingVal; SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs); return Impl.GetValue(BB); diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 47d40f0823c8..b043d4c1b0c1 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -90,12 +90,17 @@ cl::opt<bool> VerifyScheduling( "verify-misched", cl::Hidden, cl::desc("Verify machine instrs before and after machine scheduling")); +#ifndef NDEBUG +cl::opt<bool> ViewMISchedDAGs( + "view-misched-dags", cl::Hidden, + cl::desc("Pop up a window to show MISched dags after they are processed")); +#else +const bool ViewMISchedDAGs = false; +#endif // NDEBUG + } // end namespace llvm #ifndef NDEBUG -static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden, - cl::desc("Pop up a window to show MISched dags after they are processed")); - /// In some situations a few uninteresting nodes depend on nearly all other /// nodes in the graph, provide a cutoff to hide them. 
static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden, @@ -111,7 +116,6 @@ static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden, static cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden, cl::desc("Print schedule DAGs")); #else -static const bool ViewMISchedDAGs = false; static const bool PrintDAGs = false; #endif // NDEBUG @@ -561,11 +565,10 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, MBBRegionsVector MBBRegions; getSchedRegions(&*MBB, MBBRegions, Scheduler.doMBBSchedRegionsTopDown()); - for (MBBRegionsVector::iterator R = MBBRegions.begin(); - R != MBBRegions.end(); ++R) { - MachineBasicBlock::iterator I = R->RegionBegin; - MachineBasicBlock::iterator RegionEnd = R->RegionEnd; - unsigned NumRegionInstrs = R->NumRegionInstrs; + for (const SchedRegion &R : MBBRegions) { + MachineBasicBlock::iterator I = R.RegionBegin; + MachineBasicBlock::iterator RegionEnd = R.RegionEnd; + unsigned NumRegionInstrs = R.NumRegionInstrs; // Notify the scheduler of the region, even if we may skip scheduling // it. Perhaps it still needs to be bundled. 
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 8df23b781ffd..0a5ff276fedc 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -80,9 +80,9 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { void MachineTraceMetrics::releaseMemory() { MF = nullptr; BlockInfo.clear(); - for (unsigned i = 0; i != TS_NumStrategies; ++i) { - delete Ensembles[i]; - Ensembles[i] = nullptr; + for (Ensemble *&E : Ensembles) { + delete E; + E = nullptr; } } @@ -398,9 +398,9 @@ void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB) << '\n'); BlockInfo[MBB->getNumber()].invalidate(); - for (unsigned i = 0; i != TS_NumStrategies; ++i) - if (Ensembles[i]) - Ensembles[i]->invalidate(MBB); + for (Ensemble *E : Ensembles) + if (E) + E->invalidate(MBB); } void MachineTraceMetrics::verifyAnalysis() const { @@ -408,9 +408,9 @@ void MachineTraceMetrics::verifyAnalysis() const { return; #ifndef NDEBUG assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size"); - for (unsigned i = 0; i != TS_NumStrategies; ++i) - if (Ensembles[i]) - Ensembles[i]->verify(); + for (Ensemble *E : Ensembles) + if (E) + E->verify(); #endif } @@ -984,8 +984,7 @@ addLiveIns(const MachineInstr *DefMI, unsigned DefOp, const MachineBasicBlock *DefMBB = DefMI->getParent(); // Reg is live-in to all blocks in Trace that follow DefMBB. 
- for (unsigned i = Trace.size(); i; --i) { - const MachineBasicBlock *MBB = Trace[i-1]; + for (const MachineBasicBlock *MBB : llvm::reverse(Trace)) { if (MBB == DefMBB) return; TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; @@ -1204,8 +1203,8 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { for (unsigned K = 0; K != PRDepths.size(); ++K) PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]); } else { - for (unsigned K = 0; K != PRDepths.size(); ++K) - PRMax = std::max(PRMax, PRDepths[K]); + for (unsigned PRD : PRDepths) + PRMax = std::max(PRMax, PRD); } // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 32078db76cf3..005d4ad1a328 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -101,6 +101,7 @@ namespace { // Avoid querying the MachineFunctionProperties for each operand. bool isFunctionRegBankSelected; bool isFunctionSelected; + bool isFunctionTracksDebugUserValues; using RegVector = SmallVector<Register, 16>; using RegMaskVector = SmallVector<const uint32_t *, 4>; @@ -384,6 +385,8 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { MachineFunctionProperties::Property::RegBankSelected); isFunctionSelected = MF.getProperties().hasProperty( MachineFunctionProperties::Property::Selected); + isFunctionTracksDebugUserValues = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::TracksDebugUserValues); LiveVars = nullptr; LiveInts = nullptr; @@ -1605,12 +1608,16 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } break; } + case TargetOpcode::G_SHL: + case TargetOpcode::G_LSHR: + case TargetOpcode::G_ASHR: case TargetOpcode::G_ROTR: case TargetOpcode::G_ROTL: { LLT Src1Ty = MRI->getType(MI->getOperand(1).getReg()); LLT Src2Ty = MRI->getType(MI->getOperand(2).getReg()); if (Src1Ty.isVector() != Src2Ty.isVector()) { - 
report("Rotate requires operands to be either all scalars or all vectors", + report("Shifts and rotates require operands to be either all scalars or " + "all vectors", MI); break; } @@ -1980,41 +1987,50 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MO->isUndef()) report("Generic virtual register use cannot be undef", MO, MONum); - // If we're post-Select, we can't have gvregs anymore. - if (isFunctionSelected) { - report("Generic virtual register invalid in a Selected function", - MO, MONum); - return; - } + // Debug value instruction is permitted to use undefined vregs. + // This is a performance measure to skip the overhead of immediately + // pruning unused debug operands. The final undef substitution occurs + // when debug values are allocated in LDVImpl::handleDebugValue, so + // these verifications always apply after this pass. + if (isFunctionTracksDebugUserValues || !MO->isUse() || + !MI->isDebugValue() || !MRI->def_empty(Reg)) { + // If we're post-Select, we can't have gvregs anymore. + if (isFunctionSelected) { + report("Generic virtual register invalid in a Selected function", + MO, MONum); + return; + } - // The gvreg must have a type and it must not have a SubIdx. - LLT Ty = MRI->getType(Reg); - if (!Ty.isValid()) { - report("Generic virtual register must have a valid type", MO, - MONum); - return; - } + // The gvreg must have a type and it must not have a SubIdx. + LLT Ty = MRI->getType(Reg); + if (!Ty.isValid()) { + report("Generic virtual register must have a valid type", MO, + MONum); + return; + } - const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg); + const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg); - // If we're post-RegBankSelect, the gvreg must have a bank. - if (!RegBank && isFunctionRegBankSelected) { - report("Generic virtual register must have a bank in a " - "RegBankSelected function", - MO, MONum); - return; - } + // If we're post-RegBankSelect, the gvreg must have a bank. 
+ if (!RegBank && isFunctionRegBankSelected) { + report("Generic virtual register must have a bank in a " + "RegBankSelected function", + MO, MONum); + return; + } - // Make sure the register fits into its register bank if any. - if (RegBank && Ty.isValid() && - RegBank->getSize() < Ty.getSizeInBits()) { - report("Register bank is too small for virtual register", MO, - MONum); - errs() << "Register bank " << RegBank->getName() << " too small(" - << RegBank->getSize() << ") to fit " << Ty.getSizeInBits() - << "-bits\n"; - return; + // Make sure the register fits into its register bank if any. + if (RegBank && Ty.isValid() && + RegBank->getSize() < Ty.getSizeInBits()) { + report("Register bank is too small for virtual register", MO, + MONum); + errs() << "Register bank " << RegBank->getName() << " too small(" + << RegBank->getSize() << ") to fit " << Ty.getSizeInBits() + << "-bits\n"; + return; + } } + if (SubIdx) { report("Generic virtual register does not allow subregister index", MO, MONum); @@ -2217,8 +2233,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (LiveInts && Reg.isVirtual()) { if (LiveInts->hasInterval(Reg)) { LI = &LiveInts->getInterval(Reg); - if (SubRegIdx != 0 && !LI->empty() && !LI->hasSubRanges() && - MRI->shouldTrackSubRegLiveness(Reg)) + if (SubRegIdx != 0 && (MO->isDef() || !MO->isUndef()) && !LI->empty() && + !LI->hasSubRanges() && MRI->shouldTrackSubRegLiveness(Reg)) report("Live interval for subreg operand has no subranges", MO, MONum); } else { report("Virtual register has no live interval", MO, MONum); diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp index 77a6c37e1362..7693ab417de9 100644 --- a/llvm/lib/CodeGen/PHIElimination.cpp +++ b/llvm/lib/CodeGen/PHIElimination.cpp @@ -213,7 +213,7 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) { for (auto &I : LoweredPHIs) { if (LIS) LIS->RemoveMachineInstrFromMaps(*I.first); - 
MF.DeleteMachineInstr(I.first); + MF.deleteMachineInstr(I.first); } // TODO: we should use the incremental DomTree updater here. @@ -626,7 +626,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB, if (reusedIncoming || !IncomingReg) { if (LIS) LIS->RemoveMachineInstrFromMaps(*MPhi); - MF.DeleteMachineInstr(MPhi); + MF.deleteMachineInstr(MPhi); } } diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp index b85f00a61eac..d7cd0a583cee 100644 --- a/llvm/lib/CodeGen/PostRASchedulerList.cpp +++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp @@ -252,8 +252,8 @@ void SchedulePostRATDList::exitRegion() { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// dumpSchedule - dump the scheduled Sequence. LLVM_DUMP_METHOD void SchedulePostRATDList::dumpSchedule() const { - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - if (SUnit *SU = Sequence[i]) + for (const SUnit *SU : Sequence) { + if (SU) dumpNode(*SU); else dbgs() << "**** NOOP ****\n"; @@ -531,11 +531,11 @@ void SchedulePostRATDList::ListScheduleTopDown() { ReleaseSuccessors(&EntrySU); // Add all leaves to Available queue. - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + for (SUnit &SUnit : SUnits) { // It is available if it has no predecessors. 
- if (!SUnits[i].NumPredsLeft && !SUnits[i].isAvailable) { - AvailableQueue.push(&SUnits[i]); - SUnits[i].isAvailable = true; + if (!SUnit.NumPredsLeft && !SUnit.isAvailable) { + AvailableQueue.push(&SUnit); + SUnit.isAvailable = true; } } @@ -657,10 +657,7 @@ void SchedulePostRATDList::ListScheduleTopDown() { #ifndef NDEBUG unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false); - unsigned Noops = 0; - for (unsigned i = 0, e = Sequence.size(); i != e; ++i) - if (!Sequence[i]) - ++Noops; + unsigned Noops = llvm::count(Sequence, nullptr); assert(Sequence.size() - Noops == ScheduledNodes && "The number of nodes scheduled doesn't match the expected number!"); #endif // NDEBUG diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 29a88480fd9f..8d8a6126dad0 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -953,12 +953,22 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) { // LocalStackSlotPass didn't already allocate a slot for it. // If we are told to use the LocalStackAllocationBlock, the stack protector // is expected to be already pre-allocated. - if (!MFI.getUseLocalStackAllocationBlock()) + if (MFI.getStackID(StackProtectorFI) != TargetStackID::Default) { + // If the stack protector isn't on the default stack then it's up to the + // target to set the stack offset. 
+ assert(MFI.getObjectOffset(StackProtectorFI) != 0 && + "Offset of stack protector on non-default stack expected to be " + "already set."); + assert(!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex()) && + "Stack protector on non-default stack expected to not be " + "pre-allocated by LocalStackSlotPass."); + } else if (!MFI.getUseLocalStackAllocationBlock()) { AdjustStackOffset(MFI, StackProtectorFI, StackGrowsDown, Offset, MaxAlign, Skew); - else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) + } else if (!MFI.isObjectPreAllocated(MFI.getStackProtectorIndex())) { llvm_unreachable( "Stack protector not pre-allocated by LocalStackSlotPass."); + } // Assign large stack objects first. for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp index f605068e076d..882f8e91bf1d 100644 --- a/llvm/lib/CodeGen/RDFGraph.cpp +++ b/llvm/lib/CodeGen/RDFGraph.cpp @@ -1500,8 +1500,8 @@ void DataFlowGraph::buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs, // Erase from MaxRefs all elements in the closure. auto Begin = MaxRefs.begin(); - for (unsigned i = ClosureIdx.size(); i != 0; --i) - MaxRefs.erase(Begin + ClosureIdx[i-1]); + for (unsigned Idx : llvm::reverse(ClosureIdx)) + MaxRefs.erase(Begin + Idx); } } diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp new file mode 100644 index 000000000000..9f1012c95964 --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -0,0 +1,121 @@ +//===- RegAllocEvictionAdvisor.cpp - eviction advisor ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the default eviction advisor and of the Analysis pass. +// +//===----------------------------------------------------------------------===// + +#include "RegAllocEvictionAdvisor.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +static cl::opt<RegAllocEvictionAdvisorAnalysis::AdvisorMode> Mode( + "regalloc-enable-advisor", cl::Hidden, + cl::init(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default), + cl::desc("Enable regalloc advisor mode"), + cl::values( + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default, + "default", "Default"), + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release, + "release", "precompiled"), + clEnumValN(RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development, + "development", "for training"))); + +static cl::opt<bool> EnableLocalReassignment( + "enable-local-reassign", cl::Hidden, + cl::desc("Local reassignment can yield better allocation decisions, but " + "may be compile time intensive"), + cl::init(false)); + +#define DEBUG_TYPE "regalloc" + +char RegAllocEvictionAdvisorAnalysis::ID = 0; +INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict", + "Regalloc eviction policy", false, true) + +namespace { +class DefaultEvictionAdvisorAnalysis final + : public RegAllocEvictionAdvisorAnalysis { +public: + DefaultEvictionAdvisorAnalysis(bool NotAsRequested) + : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Default), + NotAsRequested(NotAsRequested) {} + + // support for isa<> and dyn_cast. 
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) { + return R->getAdvisorMode() == AdvisorMode::Default; + } + +private: + std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) override { + return std::make_unique<DefaultEvictionAdvisor>(MF, Matrix, LIS, VRM, + RegClassInfo, ExtraInfo); + } + bool doInitialization(Module &M) override { + if (NotAsRequested) + M.getContext().emitError("Requested regalloc eviction advisor analysis " + "could be created. Using default"); + return RegAllocEvictionAdvisorAnalysis::doInitialization(M); + } + const bool NotAsRequested; +}; +} // namespace + +template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() { + Pass *Ret = nullptr; + switch (Mode) { + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Default: + Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false); + break; + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development: + // TODO(mtrofin): add implementation + break; + case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release: + // TODO(mtrofin): add implementation + break; + } + if (Ret) + return Ret; + return new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ true); +} + +StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const { + switch (getAdvisorMode()) { + case AdvisorMode::Default: + return "Default Regalloc Eviction Advisor"; + case AdvisorMode::Release: + return "Release mode Regalloc Eviction Advisor"; + case AdvisorMode::Development: + return "Development mode Regalloc Eviction Advisor"; + } + llvm_unreachable("Unknown advisor kind"); +} + +RegAllocEvictionAdvisor::RegAllocEvictionAdvisor( + const MachineFunction &MF, LiveRegMatrix *Matrix, LiveIntervals *LIS, + VirtRegMap *VRM, const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) + : MF(MF), Matrix(Matrix), LIS(LIS), VRM(VRM), 
MRI(&VRM->getRegInfo()), + TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RegClassInfo), + RegCosts(TRI->getRegisterCosts(MF)), ExtraInfo(ExtraInfo), + EnableLocalReassign(EnableLocalReassignment || + MF.getSubtarget().enableRALocalReassignment( + MF.getTarget().getOptLevel())) {} diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h index 85fd3207888b..debb75ed5020 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h @@ -18,6 +18,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Pass.h" namespace llvm { @@ -85,6 +86,215 @@ struct EvictionCost { std::tie(O.BrokenHints, O.MaxWeight); } }; + +/// Track allocation stage and eviction loop prevention during allocation. +// TODO(mtrofin): Consider exposing RAGreedy in a header instead, and folding +// this back into it. +class ExtraRegInfo final { + // RegInfo - Keep additional information about each live range. + struct RegInfo { + LiveRangeStage Stage = RS_New; + + // Cascade - Eviction loop prevention. See + // canEvictInterferenceBasedOnCost(). + unsigned Cascade = 0; + + RegInfo() = default; + }; + + IndexedMap<RegInfo, VirtReg2IndexFunctor> Info; + unsigned NextCascade = 1; + +public: + ExtraRegInfo() = default; + ExtraRegInfo(const ExtraRegInfo &) = delete; + + LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; } + + LiveRangeStage getStage(const LiveInterval &VirtReg) const { + return getStage(VirtReg.reg()); + } + + void setStage(Register Reg, LiveRangeStage Stage) { + Info.grow(Reg.id()); + Info[Reg].Stage = Stage; + } + + void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { + setStage(VirtReg.reg(), Stage); + } + + /// Return the current stage of the register, if present, otherwise initialize + /// it and return that. 
+ LiveRangeStage getOrInitStage(Register Reg) { + Info.grow(Reg.id()); + return getStage(Reg); + } + + unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; } + + void setCascade(Register Reg, unsigned Cascade) { + Info.grow(Reg.id()); + Info[Reg].Cascade = Cascade; + } + + unsigned getOrAssignNewCascade(Register Reg) { + unsigned Cascade = getCascade(Reg); + if (!Cascade) { + Cascade = NextCascade++; + setCascade(Reg, Cascade); + } + return Cascade; + } + + unsigned getCascadeOrCurrentNext(Register Reg) const { + unsigned Cascade = getCascade(Reg); + if (!Cascade) + Cascade = NextCascade; + return Cascade; + } + + template <typename Iterator> + void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { + for (; Begin != End; ++Begin) { + Register Reg = *Begin; + Info.grow(Reg.id()); + if (Info[Reg].Stage == RS_New) + Info[Reg].Stage = NewStage; + } + } + void LRE_DidCloneVirtReg(Register New, Register Old); +}; + +/// Interface to the eviction advisor, which is responsible for making a +/// decision as to which live ranges should be evicted (if any). +class RegAllocEvictionAdvisor { +public: + RegAllocEvictionAdvisor(const RegAllocEvictionAdvisor &) = delete; + RegAllocEvictionAdvisor(RegAllocEvictionAdvisor &&) = delete; + virtual ~RegAllocEvictionAdvisor() = default; + + /// Find a physical register that can be freed by evicting the FixedRegisters, + /// or return NoRegister. The eviction decision is assumed to be correct (i.e. + /// no fixed live ranges are evicted) and profitable. + virtual MCRegister + tryFindEvictionCandidate(LiveInterval &VirtReg, const AllocationOrder &Order, + uint8_t CostPerUseLimit, + const SmallVirtRegSet &FixedRegisters) const = 0; + + /// Find out if we can evict the live ranges occupying the given PhysReg, + /// which is a hint (preferred register) for VirtReg. 
+ virtual bool + canEvictHintInterference(LiveInterval &VirtReg, MCRegister PhysReg, + const SmallVirtRegSet &FixedRegisters) const = 0; + + /// Returns true if the given \p PhysReg is a callee saved register and has + /// not been used for allocation yet. + bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; + +protected: + RegAllocEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo); + + Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; + + const MachineFunction &MF; + LiveRegMatrix *const Matrix; + LiveIntervals *const LIS; + VirtRegMap *const VRM; + MachineRegisterInfo *const MRI; + const TargetRegisterInfo *const TRI; + const RegisterClassInfo &RegClassInfo; + const ArrayRef<uint8_t> RegCosts; + ExtraRegInfo *const ExtraInfo; + + /// Run or not the local reassignment heuristic. This information is + /// obtained from the TargetSubtargetInfo. + const bool EnableLocalReassign; + +private: + unsigned NextCascade = 1; +}; + +/// ImmutableAnalysis abstraction for fetching the Eviction Advisor. We model it +/// as an analysis to decouple the user from the implementation insofar as +/// dependencies on other analyses goes. The motivation for it being an +/// immutable pass is twofold: +/// - in the ML implementation case, the evaluator is stateless but (especially +/// in the development mode) expensive to set up. With an immutable pass, we set +/// it up once. +/// - in the 'development' mode ML case, we want to capture the training log +/// during allocation (this is a log of features encountered and decisions +/// made), and then measure a score, potentially a few steps after allocation +/// completes. So we need the properties of an immutable pass to keep the logger +/// state around until we can make that measurement. 
+/// +/// Because we need to offer additional services in 'development' mode, the +/// implementations of this analysis need to implement RTTI support. +class RegAllocEvictionAdvisorAnalysis : public ImmutablePass { +public: + enum class AdvisorMode : int { Default, Release, Development }; + + RegAllocEvictionAdvisorAnalysis(AdvisorMode Mode) + : ImmutablePass(ID), Mode(Mode){}; + static char ID; + + /// Get an advisor for the given context (i.e. machine function, etc) + virtual std::unique_ptr<RegAllocEvictionAdvisor> + getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) = 0; + AdvisorMode getAdvisorMode() const { return Mode; } + +private: + // This analysis preserves everything, and subclasses may have additional + // requirements. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + StringRef getPassName() const override; + const AdvisorMode Mode; +}; + +/// Specialization for the API used by the analysis infrastructure to create +/// an instance of the eviction advisor. +template <> Pass *callDefaultCtor<RegAllocEvictionAdvisorAnalysis>(); + +// TODO(mtrofin): implement these. 
+#ifdef LLVM_HAVE_TF_AOT +RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor(); +#endif + +#ifdef LLVM_HAVE_TF_API +RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor(); +#endif + +// TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation +// out of RegAllocGreedy.cpp +class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor { +public: + DefaultEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix, + LiveIntervals *LIS, VirtRegMap *VRM, + const RegisterClassInfo &RegClassInfo, + ExtraRegInfo *ExtraInfo) + : RegAllocEvictionAdvisor(MF, Matrix, LIS, VRM, RegClassInfo, ExtraInfo) { + } + +private: + MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &, + uint8_t, + const SmallVirtRegSet &) const override; + bool canEvictHintInterference(LiveInterval &, MCRegister, + const SmallVirtRegSet &) const override; + bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, + EvictionCost &, + const SmallVirtRegSet &) const; + bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; +}; } // namespace llvm #endif // LLVM_CODEGEN_REGALLOCEVICTIONADVISOR_H diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 50411c177007..ce3cf31dbd6b 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -112,12 +112,6 @@ static cl::opt<bool> ExhaustiveSearch( "and interference cutoffs of last chance recoloring"), cl::Hidden); -static cl::opt<bool> EnableLocalReassignment( - "enable-local-reassign", cl::Hidden, - cl::desc("Local reassignment can yield better allocation decisions, but " - "may be compile time intensive"), - cl::init(false)); - static cl::opt<bool> EnableDeferredSpilling( "enable-deferred-spilling", cl::Hidden, cl::desc("Instead of spilling a variable right away, defer the actual " @@ -172,8 +166,9 @@ class RAGreedy : public MachineFunctionPass, // state std::unique_ptr<Spiller> SpillerInstance; PQueue 
Queue; - unsigned NextCascade; std::unique_ptr<VirtRegAuxInfo> VRAI; + Optional<ExtraRegInfo> ExtraInfo; + std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor; // Enum CutOffStage to keep a track whether the register allocation failed // because of the cutoffs encountered in last chance recoloring. @@ -195,76 +190,6 @@ class RAGreedy : public MachineFunctionPass, static const char *const StageName[]; #endif - // RegInfo - Keep additional information about each live range. - struct RegInfo { - LiveRangeStage Stage = RS_New; - - // Cascade - Eviction loop prevention. See - // canEvictInterferenceBasedOnCost(). - unsigned Cascade = 0; - - RegInfo() = default; - }; - - IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo; - - LiveRangeStage getStage(Register Reg) const { - return ExtraRegInfo[Reg].Stage; - } - - LiveRangeStage getStage(const LiveInterval &VirtReg) const { - return getStage(VirtReg.reg()); - } - - void setStage(Register Reg, LiveRangeStage Stage) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[Reg].Stage = Stage; - } - - void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) { - setStage(VirtReg.reg(), Stage); - } - - /// Return the current stage of the register, if present, otherwise initialize - /// it and return that. 
- LiveRangeStage getOrInitStage(Register Reg) { - ExtraRegInfo.grow(Reg); - return getStage(Reg); - } - - unsigned getCascade(Register Reg) const { return ExtraRegInfo[Reg].Cascade; } - - void setCascade(Register Reg, unsigned Cascade) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - ExtraRegInfo[Reg].Cascade = Cascade; - } - - unsigned getOrAssignNewCascade(Register Reg) { - unsigned Cascade = getCascade(Reg); - if (!Cascade) { - Cascade = NextCascade++; - setCascade(Reg, Cascade); - } - return Cascade; - } - - unsigned getCascadeOrCurrentNext(Register Reg) const { - unsigned Cascade = getCascade(Reg); - if (!Cascade) - Cascade = NextCascade; - return Cascade; - } - - template<typename Iterator> - void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) { - ExtraRegInfo.resize(MRI->getNumVirtRegs()); - for (;Begin != End; ++Begin) { - Register Reg = *Begin; - if (ExtraRegInfo[Reg].Stage == RS_New) - ExtraRegInfo[Reg].Stage = NewStage; - } - } - /// EvictionTrack - Keeps track of past evictions in order to optimize region /// split decision. class EvictionTrack { @@ -375,10 +300,6 @@ class RAGreedy : public MachineFunctionPass, /// Callee-save register cost, calculated once per machine function. BlockFrequency CSRCost; - /// Run or not the local reassignment heuristic. This information is - /// obtained from the TargetSubtargetInfo. - bool EnableLocalReassign; - /// Enable or not the consideration of the cost of local intervals created /// by a split candidate when choosing the best split candidate. 
bool EnableAdvancedRASplitCost; @@ -447,13 +368,6 @@ private: bool calcCompactRegion(GlobalSplitCandidate&); void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>); void calcGapWeights(MCRegister, SmallVectorImpl<float> &); - Register canReassign(LiveInterval &VirtReg, Register PrevReg) const; - bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool) const; - bool canEvictInterferenceBasedOnCost(LiveInterval &, MCRegister, bool, - EvictionCost &, - const SmallVirtRegSet &) const; - bool canEvictHintInterference(LiveInterval &, MCRegister, - const SmallVirtRegSet &) const; bool canEvictInterferenceInRange(const LiveInterval &VirtReg, MCRegister PhysReg, SlotIndex Start, SlotIndex End, EvictionCost &MaxCost) const; @@ -529,8 +443,6 @@ private: BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister); void collectHintInfo(Register, HintsInfo &); - bool isUnusedCalleeSavedReg(MCRegister PhysReg) const; - /// Greedy RA statistic to remark. struct RAGreedyStats { unsigned Reloads = 0; @@ -597,6 +509,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(SpillPlacement) INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass) +INITIALIZE_PASS_DEPENDENCY(RegAllocEvictionAdvisorAnalysis) INITIALIZE_PASS_END(RAGreedy, "greedy", "Greedy Register Allocator", false, false) @@ -663,6 +576,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<EdgeBundles>(); AU.addRequired<SpillPlacement>(); AU.addRequired<MachineOptimizationRemarkEmitterPass>(); + AU.addRequired<RegAllocEvictionAdvisorAnalysis>(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -696,22 +610,25 @@ void RAGreedy::LRE_WillShrinkVirtReg(Register VirtReg) { } void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) { + ExtraInfo->LRE_DidCloneVirtReg(New, Old); +} + +void ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) { // Cloning a register we haven't even heard about yet? Just ignore it. 
- if (!ExtraRegInfo.inBounds(Old)) + if (!Info.inBounds(Old)) return; // LRE may clone a virtual register because dead code elimination causes it to // be split into connected components. The new components are much smaller // than the original, so they should get a new chance at being assigned. // same stage as the parent. - ExtraRegInfo[Old].Stage = RS_Assign; - ExtraRegInfo.grow(New); - ExtraRegInfo[New] = ExtraRegInfo[Old]; + Info[Old].Stage = RS_Assign; + Info.grow(New.id()); + Info[New] = Info[Old]; } void RAGreedy::releaseMemory() { SpillerInstance.reset(); - ExtraRegInfo.clear(); GlobalCand.clear(); } @@ -725,10 +642,10 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) { assert(Reg.isVirtual() && "Can only enqueue virtual registers"); unsigned Prio; - auto Stage = getOrInitStage(Reg); + auto Stage = ExtraInfo->getOrInitStage(Reg); if (Stage == RS_New) { Stage = RS_Assign; - setStage(Reg, Stage); + ExtraInfo->setStage(Reg, Stage); } if (Stage == RS_Split) { // Unsplit ranges that couldn't be allocated immediately are deferred until @@ -824,7 +741,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, MCRegister PhysHint = Hint.asMCReg(); LLVM_DEBUG(dbgs() << "missed hint " << printReg(PhysHint, TRI) << '\n'); - if (canEvictHintInterference(VirtReg, PhysHint, FixedRegisters)) { + if (EvictAdvisor->canEvictHintInterference(VirtReg, PhysHint, + FixedRegisters)) { evictInterference(VirtReg, PhysHint, NewVRegs); return PhysHint; } @@ -850,7 +768,8 @@ MCRegister RAGreedy::tryAssign(LiveInterval &VirtReg, // Interference eviction //===----------------------------------------------------------------------===// -Register RAGreedy::canReassign(LiveInterval &VirtReg, Register PrevReg) const { +Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg, + Register PrevReg) const { auto Order = AllocationOrder::create(VirtReg.reg(), *VRM, RegClassInfo, Matrix); MCRegister PhysReg; @@ -889,9 +808,10 @@ Register RAGreedy::canReassign(LiveInterval 
&VirtReg, Register PrevReg) const { /// register. /// @param B The live range to be evicted. /// @param BreaksHint True when B is already assigned to its preferred register. -bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, - LiveInterval &B, bool BreaksHint) const { - bool CanSplit = getStage(B) < RS_Spill; +bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint, + LiveInterval &B, + bool BreaksHint) const { + bool CanSplit = ExtraInfo->getStage(B) < RS_Spill; // Be fairly aggressive about following hints as long as the evictee can be // split. @@ -907,7 +827,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint, /// canEvictHintInterference - return true if the interference for VirtReg /// on the PhysReg, which is VirtReg's hint, can be evicted in favor of VirtReg. -bool RAGreedy::canEvictHintInterference( +bool DefaultEvictionAdvisor::canEvictHintInterference( LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const { EvictionCost MaxCost; @@ -925,7 +845,7 @@ bool RAGreedy::canEvictHintInterference( /// @param MaxCost Only look for cheaper candidates and update with new cost /// when returning true. /// @returns True when interference can be evicted cheaper than MaxCost. -bool RAGreedy::canEvictInterferenceBasedOnCost( +bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint, EvictionCost &MaxCost, const SmallVirtRegSet &FixedRegisters) const { // It is only possible to evict virtual register interference. @@ -941,9 +861,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( // // This works out so a register without a cascade number is allowed to evict // anything, and it can be evicted by anything. 
- unsigned Cascade = ExtraRegInfo[VirtReg.reg()].Cascade; - if (!Cascade) - Cascade = NextCascade; + unsigned Cascade = ExtraInfo->getCascadeOrCurrentNext(VirtReg.reg()); EvictionCost Cost; for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) { @@ -965,7 +883,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( return false; // Never evict spill products. They cannot split or spill. - if (getStage(*Intf) == RS_Done) + if (ExtraInfo->getStage(*Intf) == RS_Done) return false; // Once a live range becomes small enough, it is urgent that we find a // register for it. This is indicated by an infinite spill weight. These @@ -980,7 +898,7 @@ bool RAGreedy::canEvictInterferenceBasedOnCost( RegClassInfo.getNumAllocatableRegs( MRI->getRegClass(Intf->reg()))); // Only evict older cascades or live ranges without a cascade. - unsigned IntfCascade = ExtraRegInfo[Intf->reg()].Cascade; + unsigned IntfCascade = ExtraInfo->getCascade(Intf->reg()); if (Cascade <= IntfCascade) { if (!Urgent) return false; @@ -1043,7 +961,7 @@ bool RAGreedy::canEvictInterferenceInRange(const LiveInterval &VirtReg, if (!Register::isVirtualRegister(Intf->reg())) return false; // Never evict spill products. They cannot split or spill. - if (getStage(*Intf) == RS_Done) + if (ExtraInfo->getStage(*Intf) == RS_Done) return false; // Would this break a satisfied hint? @@ -1106,7 +1024,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, // Make sure that VirtReg has a cascade number, and assign that cascade // number to every evicted register. These live ranges than then only be // evicted by a newer cascade, preventing infinite loops. 
- unsigned Cascade = getOrAssignNewCascade(VirtReg.reg()); + unsigned Cascade = ExtraInfo->getOrAssignNewCascade(VirtReg.reg()); LLVM_DEBUG(dbgs() << "evicting " << printReg(PhysReg, TRI) << " interference: Cascade " << Cascade << '\n'); @@ -1132,10 +1050,10 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, LastEvicted.addEviction(PhysReg, VirtReg.reg(), Intf->reg()); Matrix->unassign(*Intf); - assert((getCascade(Intf->reg()) < Cascade || + assert((ExtraInfo->getCascade(Intf->reg()) < Cascade || VirtReg.isSpillable() < Intf->isSpillable()) && "Cannot decrease cascade number, illegal eviction"); - setCascade(Intf->reg(), Cascade); + ExtraInfo->setCascade(Intf->reg(), Cascade); ++NumEvicted; NewVRegs.push_back(Intf->reg()); } @@ -1143,7 +1061,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, MCRegister PhysReg, /// Returns true if the given \p PhysReg is a callee saved register and has not /// been used for allocation yet. -bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const { +bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const { MCRegister CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg); if (!CSR) return false; @@ -1151,7 +1069,7 @@ bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const { return !Matrix->isPhysRegUsed(PhysReg); } -MCRegister RAGreedy::tryFindEvictionCandidate( +MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate( LiveInterval &VirtReg, const AllocationOrder &Order, uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const { // Keep track of the cheapest interference seen so far. 
@@ -1225,8 +1143,8 @@ MCRegister RAGreedy::tryEvict(LiveInterval &VirtReg, AllocationOrder &Order, NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription, TimePassesIsEnabled); - MCRegister BestPhys = - tryFindEvictionCandidate(VirtReg, Order, CostPerUseLimit, FixedRegisters); + MCRegister BestPhys = EvictAdvisor->tryFindEvictionCandidate( + VirtReg, Order, CostPerUseLimit, FixedRegisters); if (BestPhys.isValid()) evictInterference(VirtReg, BestPhys, NewVRegs); return BestPhys; @@ -1769,8 +1687,8 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, // the ActiveBlocks list with each candidate. We need to filter out // duplicates. BitVector Todo = SA->getThroughBlocks(); - for (unsigned c = 0; c != UsedCands.size(); ++c) { - ArrayRef<unsigned> Blocks = GlobalCand[UsedCands[c]].ActiveBlocks; + for (unsigned UsedCand : UsedCands) { + ArrayRef<unsigned> Blocks = GlobalCand[UsedCand].ActiveBlocks; for (unsigned Number : Blocks) { if (!Todo.test(Number)) continue; @@ -1817,13 +1735,13 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, const LiveInterval &Reg = LIS->getInterval(LREdit.get(I)); // Ignore old intervals from DCE. - if (getOrInitStage(Reg.reg()) != RS_New) + if (ExtraInfo->getOrInitStage(Reg.reg()) != RS_New) continue; // Remainder interval. Don't try splitting again, spill if it doesn't // allocate. if (IntvMap[I] == 0) { - setStage(Reg, RS_Spill); + ExtraInfo->setStage(Reg, RS_Spill); continue; } @@ -1834,7 +1752,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, LLVM_DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks << " blocks as original.\n"); // Don't allow repeated splitting as a safe guard against looping. 
- setStage(Reg, RS_Split2); + ExtraInfo->setStage(Reg, RS_Split2); } continue; } @@ -1899,7 +1817,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg, unsigned BestCand = NoCand; for (MCPhysReg PhysReg : Order) { assert(PhysReg); - if (IgnoreCSR && isUnusedCalleeSavedReg(PhysReg)) + if (IgnoreCSR && EvictAdvisor->isUnusedCalleeSavedReg(PhysReg)) continue; // Discard bad candidates before we run out of interference cache cursors. @@ -2065,8 +1983,8 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order, // goes straight to spilling, the new local ranges get to stay RS_New. for (unsigned I = 0, E = LREdit.size(); I != E; ++I) { const LiveInterval &LI = LIS->getInterval(LREdit.get(I)); - if (getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) - setStage(LI, RS_Spill); + if (ExtraInfo->getOrInitStage(LI.reg()) == RS_New && IntvMap[I] == 0) + ExtraInfo->setStage(LI, RS_Spill); } if (VerifyEnabled) @@ -2152,7 +2070,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order, SE->finish(&IntvMap); DebugVars->splitRegister(VirtReg.reg(), LREdit.regs(), *LIS); // Assign all new registers to RS_Spill. This was the last chance. - setStage(LREdit.begin(), LREdit.end(), RS_Spill); + ExtraInfo->setStage(LREdit.begin(), LREdit.end(), RS_Spill); return 0; } @@ -2320,7 +2238,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, // These rules allow a 3 -> 2+3 split once, which we need. They also prevent // excessive splitting and infinite loops. // - bool ProgressRequired = getStage(VirtReg) >= RS_Split2; + bool ProgressRequired = ExtraInfo->getStage(VirtReg) >= RS_Split2; // Best split candidate. 
unsigned BestBefore = NumGaps; @@ -2456,7 +2374,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order, assert(!ProgressRequired && "Didn't make progress when it was required."); for (unsigned I = 0, E = IntvMap.size(); I != E; ++I) if (IntvMap[I] == 1) { - setStage(LIS->getInterval(LREdit.get(I)), RS_Split2); + ExtraInfo->setStage(LIS->getInterval(LREdit.get(I)), RS_Split2); LLVM_DEBUG(dbgs() << ' ' << printReg(LREdit.get(I))); } LLVM_DEBUG(dbgs() << '\n'); @@ -2477,7 +2395,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, SmallVectorImpl<Register> &NewVRegs, const SmallVirtRegSet &FixedRegisters) { // Ranges must be Split2 or less. - if (getStage(VirtReg) >= RS_Spill) + if (ExtraInfo->getStage(VirtReg) >= RS_Spill) return 0; // Local intervals are handled separately. @@ -2499,7 +2417,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order, // First try to split around a region spanning multiple blocks. RS_Split2 // ranges already made dubious progress with region splitting, so they go // straight to single block splitting. - if (getStage(VirtReg) < RS_Split2) { + if (ExtraInfo->getStage(VirtReg) < RS_Split2) { MCRegister PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs); if (PhysReg || !NewVRegs.empty()) return PhysReg; @@ -2551,7 +2469,7 @@ bool RAGreedy::mayRecolorAllInterferences( // it would not be recolorable as it is in the same state as VirtReg. // However, if VirtReg has tied defs and Intf doesn't, then // there is still a point in examining if it can be recolorable. - if (((getStage(*Intf) == RS_Done && + if (((ExtraInfo->getStage(*Intf) == RS_Done && MRI->getRegClass(Intf->reg()) == CurRC) && !(hasTiedDef(MRI, VirtReg.reg()) && !hasTiedDef(MRI, Intf->reg()))) || @@ -2615,7 +2533,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg, LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n'); // Ranges must be Done. 
- assert((getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && + assert((ExtraInfo->getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) && "Last chance recoloring should really be last chance"); // Set the max depth to LastChanceRecoloringMaxDepth. // We may want to reconsider that if we end up with a too large search space @@ -2806,7 +2724,7 @@ MCRegister RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, MCRegister PhysReg, uint8_t &CostPerUseLimit, SmallVectorImpl<Register> &NewVRegs) { - if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) { + if (ExtraInfo->getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) { // We choose spill over using the CSR for the first time if the spill cost // is lower than CSRCost. SA->analyze(&VirtReg); @@ -2818,7 +2736,7 @@ RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order, CostPerUseLimit = 1; return 0; } - if (getStage(VirtReg) < RS_Split) { + if (ExtraInfo->getStage(VirtReg) < RS_Split) { // We choose pre-splitting over using the CSR for the first time if // the cost of splitting is lower than CSRCost. SA->analyze(&VirtReg); @@ -3051,8 +2969,8 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // When NewVRegs is not empty, we may have made decisions such as evicting // a virtual register, go with the earlier decisions and use the physical // register. 
- if (CSRCost.getFrequency() && isUnusedCalleeSavedReg(PhysReg) && - NewVRegs.empty()) { + if (CSRCost.getFrequency() && + EvictAdvisor->isUnusedCalleeSavedReg(PhysReg) && NewVRegs.empty()) { MCRegister CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg, CostPerUseLimit, NewVRegs); if (CSRReg || !NewVRegs.empty()) @@ -3063,9 +2981,9 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, return PhysReg; } - LiveRangeStage Stage = getStage(VirtReg); + LiveRangeStage Stage = ExtraInfo->getStage(VirtReg); LLVM_DEBUG(dbgs() << StageName[Stage] << " Cascade " - << getCascade(VirtReg.reg()) << '\n'); + << ExtraInfo->getCascade(VirtReg.reg()) << '\n'); // Try to evict a less worthy live range, but only for ranges from the primary // queue. The RS_Split ranges already failed to do this, and they should not @@ -3094,7 +3012,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Wait until the second time, when all smaller ranges have been allocated. // This gives a better picture of the interference to split around. if (Stage < RS_Split) { - setStage(VirtReg, RS_Split); + ExtraInfo->setStage(VirtReg, RS_Split); LLVM_DEBUG(dbgs() << "wait for second round\n"); NewVRegs.push_back(VirtReg.reg()); return 0; @@ -3120,12 +3038,12 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, // Finally spill VirtReg itself. if ((EnableDeferredSpilling || TRI->shouldUseDeferredSpillingForVirtReg(*MF, VirtReg)) && - getStage(VirtReg) < RS_Memory) { + ExtraInfo->getStage(VirtReg) < RS_Memory) { // TODO: This is experimental and in particular, we do not model // the live range splitting done by spilling correctly. // We would need a deep integration with the spiller to do the // right thing here. Anyway, that is still good for early testing. 
- setStage(VirtReg, RS_Memory); + ExtraInfo->setStage(VirtReg, RS_Memory); LLVM_DEBUG(dbgs() << "Do as if this register is in memory\n"); NewVRegs.push_back(VirtReg.reg()); } else { @@ -3133,7 +3051,7 @@ MCRegister RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg, TimerGroupDescription, TimePassesIsEnabled); LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats); spiller().spill(LRE); - setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); + ExtraInfo->setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done); // Tell LiveDebugVariables about the new ranges. Ranges not being covered by // the new regs are kept in LDV (still mapping to the old register), until @@ -3316,10 +3234,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); RCI.runOnMachineFunction(mf); - EnableLocalReassign = EnableLocalReassignment || - MF->getSubtarget().enableRALocalReassignment( - MF->getTarget().getOptLevel()); - EnableAdvancedRASplitCost = ConsiderLocalIntervalCost.getNumOccurrences() ? ConsiderLocalIntervalCost @@ -3354,8 +3268,9 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops)); SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI)); - ExtraRegInfo.clear(); - NextCascade = 1; + ExtraInfo.emplace(); + EvictAdvisor = getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor( + *MF, Matrix, LIS, VRM, RegClassInfo, &*ExtraInfo); IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI); GlobalCand.resize(32); // This will grow as needed. SetOfBrokenHints.clear(); diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp index b22eb080791e..93be8f689d57 100644 --- a/llvm/lib/CodeGen/RegAllocPBQP.cpp +++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp @@ -623,8 +623,8 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, // Compute an initial allowed set for the current vreg. 
std::vector<MCRegister> VRegAllowed; ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF); - for (unsigned I = 0; I != RawPRegOrder.size(); ++I) { - MCRegister PReg(RawPRegOrder[I]); + for (MCPhysReg R : RawPRegOrder) { + MCRegister PReg(R); if (MRI.isReserved(PReg)) continue; diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp new file mode 100644 index 000000000000..740890831617 --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -0,0 +1,124 @@ +//===- RegAllocScore.cpp - evaluate regalloc policy quality ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// Calculate a measure of the register allocation policy quality. This is used +/// to construct a reward for the training of the ML-driven allocation policy. +/// Currently, the score is the sum of the machine basic block frequency-weighed +/// number of loads, stores, copies, and remat instructions, each factored with +/// a relative weight. 
+//===----------------------------------------------------------------------===// + +#include "RegAllocScore.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include <cassert> +#include <cstdint> +#include <numeric> +#include <vector> + +using namespace llvm; +cl::opt<double> CopyWeight("regalloc-copy-weight", cl::init(0.2), cl::Hidden); +cl::opt<double> LoadWeight("regalloc-load-weight", cl::init(4.0), cl::Hidden); +cl::opt<double> StoreWeight("regalloc-store-weight", cl::init(1.0), cl::Hidden); +cl::opt<double> CheapRematWeight("regalloc-cheap-remat-weight", cl::init(0.2), + cl::Hidden); +cl::opt<double> ExpensiveRematWeight("regalloc-expensive-remat-weight", + cl::init(1.0), cl::Hidden); +#define DEBUG_TYPE "regalloc-score" + +RegAllocScore &RegAllocScore::operator+=(const RegAllocScore &Other) { + CopyCounts += Other.copyCounts(); + LoadCounts += Other.loadCounts(); + StoreCounts += Other.storeCounts(); + LoadStoreCounts += Other.loadStoreCounts(); + CheapRematCounts += Other.cheapRematCounts(); + ExpensiveRematCounts += Other.expensiveRematCounts(); + return *this; +} + +bool RegAllocScore::operator==(const RegAllocScore &Other) const { + return copyCounts() == Other.copyCounts() && + loadCounts() == Other.loadCounts() && + storeCounts() == Other.storeCounts() && + loadStoreCounts() == Other.loadStoreCounts() && + cheapRematCounts() == Other.cheapRematCounts() && + expensiveRematCounts() == Other.expensiveRematCounts(); +} + +bool RegAllocScore::operator!=(const RegAllocScore &Other) const { + return !(*this == Other); +} + +double RegAllocScore::getScore() const { + double Ret = 0.0; + Ret += CopyWeight * copyCounts(); + Ret += LoadWeight * loadCounts(); 
+ Ret += StoreWeight * storeCounts(); + Ret += (LoadWeight + StoreWeight) * loadStoreCounts(); + Ret += CheapRematWeight * cheapRematCounts(); + Ret += ExpensiveRematWeight * expensiveRematCounts(); + + return Ret; +} + +RegAllocScore +llvm::calculateRegAllocScore(const MachineFunction &MF, + const MachineBlockFrequencyInfo &MBFI, + AAResults &AAResults) { + return calculateRegAllocScore( + MF, + [&](const MachineBasicBlock &MBB) { + return MBFI.getBlockFreqRelativeToEntryBlock(&MBB); + }, + [&](const MachineInstr &MI) { + return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable( + MI, &AAResults); + }); +} + +RegAllocScore llvm::calculateRegAllocScore( + const MachineFunction &MF, + llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq, + llvm::function_ref<bool(const MachineInstr &)> + IsTriviallyRematerializable) { + RegAllocScore Total; + + for (const MachineBasicBlock &MBB : MF) { + double BlockFreqRelativeToEntrypoint = GetBBFreq(MBB); + RegAllocScore MBBScore; + + for (const MachineInstr &MI : MBB) { + if (MI.isDebugInstr() || MI.isKill() || MI.isInlineAsm()) { + continue; + } + if (MI.isCopy()) { + MBBScore.onCopy(BlockFreqRelativeToEntrypoint); + } else if (IsTriviallyRematerializable(MI)) { + if (MI.getDesc().isAsCheapAsAMove()) { + MBBScore.onCheapRemat(BlockFreqRelativeToEntrypoint); + } else { + MBBScore.onExpensiveRemat(BlockFreqRelativeToEntrypoint); + } + } else if (MI.mayLoad() && MI.mayStore()) { + MBBScore.onLoadStore(BlockFreqRelativeToEntrypoint); + } else if (MI.mayLoad()) { + MBBScore.onLoad(BlockFreqRelativeToEntrypoint); + } else if (MI.mayStore()) { + MBBScore.onStore(BlockFreqRelativeToEntrypoint); + } + } + Total += MBBScore; + } + return Total; +} diff --git a/llvm/lib/CodeGen/RegAllocScore.h b/llvm/lib/CodeGen/RegAllocScore.h new file mode 100644 index 000000000000..3c28bb61189d --- /dev/null +++ b/llvm/lib/CodeGen/RegAllocScore.h @@ -0,0 +1,80 @@ +//==- RegAllocScore.h - evaluate regalloc policy quality 
----------*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// Calculate a measure of the register allocation policy quality. This is used +/// to construct a reward for the training of the ML-driven allocation policy. +/// Currently, the score is the sum of the machine basic block frequency-weighed +/// number of loads, stores, copies, and remat instructions, each factored with +/// a relative weight. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_REGALLOCSCORE_H_ +#define LLVM_CODEGEN_REGALLOCSCORE_H_ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/Utils/TFUtils.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/IR/Module.h" +#include <cassert> +#include <cstdint> +#include <limits> + +namespace llvm { + +/// Regalloc score. 
+class RegAllocScore final { + double CopyCounts = 0.0; + double LoadCounts = 0.0; + double StoreCounts = 0.0; + double CheapRematCounts = 0.0; + double LoadStoreCounts = 0.0; + double ExpensiveRematCounts = 0.0; + +public: + RegAllocScore() = default; + RegAllocScore(const RegAllocScore &) = default; + + double copyCounts() const { return CopyCounts; } + double loadCounts() const { return LoadCounts; } + double storeCounts() const { return StoreCounts; } + double loadStoreCounts() const { return LoadStoreCounts; } + double expensiveRematCounts() const { return ExpensiveRematCounts; } + double cheapRematCounts() const { return CheapRematCounts; } + + void onCopy(double Freq) { CopyCounts += Freq; } + void onLoad(double Freq) { LoadCounts += Freq; } + void onStore(double Freq) { StoreCounts += Freq; } + void onLoadStore(double Freq) { LoadStoreCounts += Freq; } + void onExpensiveRemat(double Freq) { ExpensiveRematCounts += Freq; } + void onCheapRemat(double Freq) { CheapRematCounts += Freq; } + + RegAllocScore &operator+=(const RegAllocScore &Other); + bool operator==(const RegAllocScore &Other) const; + bool operator!=(const RegAllocScore &Other) const; + double getScore() const; +}; + +/// Calculate a score. When comparing 2 scores for the same function but +/// different policies, the better policy would have a smaller score. +/// The implementation is the overload below (which is also easily unittestable) +RegAllocScore calculateRegAllocScore(const MachineFunction &MF, + const MachineBlockFrequencyInfo &MBFI, + AAResults &AAResults); + +/// Implementation of the above, which is also more easily unittestable. 
+RegAllocScore calculateRegAllocScore( + const MachineFunction &MF, + llvm::function_ref<double(const MachineBasicBlock &)> GetBBFreq, + llvm::function_ref<bool(const MachineInstr &)> IsTriviallyRematerializable); +} // end namespace llvm + +#endif // LLVM_CODEGEN_REGALLOCSCORE_H_ diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 797899fb5b86..65a65b9cae95 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -109,8 +109,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // FIXME: Once targets reserve registers instead of removing them from the // allocation order, we can simply use begin/end here. ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF); - for (unsigned i = 0; i != RawOrder.size(); ++i) { - unsigned PhysReg = RawOrder[i]; + for (unsigned PhysReg : RawOrder) { // Remove reserved registers from the allocation order. if (Reserved.test(PhysReg)) continue; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 4c8534cf2d01..a917b0d27d4a 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -4067,13 +4067,13 @@ void RegisterCoalescer::joinAllIntervals() { // Coalesce intervals in MBB priority order. unsigned CurrDepth = std::numeric_limits<unsigned>::max(); - for (unsigned i = 0, e = MBBs.size(); i != e; ++i) { + for (MBBPriorityInfo &MBB : MBBs) { // Try coalescing the collected local copies for deeper loops. 
- if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) { + if (JoinGlobalCopies && MBB.Depth < CurrDepth) { coalesceLocals(); - CurrDepth = MBBs[i].Depth; + CurrDepth = MBB.Depth; } - copyCoalesceInMBB(MBBs[i].MBB); + copyCoalesceInMBB(MBB.MBB); } lateLiveIntervalUpdate(); coalesceLocals(); diff --git a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp index de6129a912d3..49859aeec78b 100644 --- a/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp +++ b/llvm/lib/CodeGen/RemoveRedundantDebugValues.cpp @@ -159,20 +159,17 @@ static bool reduceDbgValsBackwardScan(MachineBasicBlock &MBB) { SmallVector<MachineInstr *, 8> DbgValsToBeRemoved; SmallDenseSet<DebugVariable> VariableSet; - for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); - I != E; ++I) { - MachineInstr *MI = &*I; - - if (MI->isDebugValue()) { - DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(), - MI->getDebugLoc()->getInlinedAt()); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MI.isDebugValue()) { + DebugVariable Var(MI.getDebugVariable(), MI.getDebugExpression(), + MI.getDebugLoc()->getInlinedAt()); auto R = VariableSet.insert(Var); // If it is a DBG_VALUE describing a constant as: // DBG_VALUE 0, ... // we just don't consider such instructions as candidates // for redundant removal. - if (MI->isNonListDebugValue()) { - MachineOperand &Loc = MI->getDebugOperand(0); + if (MI.isNonListDebugValue()) { + MachineOperand &Loc = MI.getDebugOperand(0); if (!Loc.isReg()) { // If we have already encountered this variable, just stop // tracking it. @@ -185,7 +182,7 @@ static bool reduceDbgValsBackwardScan(MachineBasicBlock &MBB) { // We have already encountered the value for this variable, // so this one can be deleted. 
if (!R.second) - DbgValsToBeRemoved.push_back(MI); + DbgValsToBeRemoved.push_back(&MI); continue; } diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 50d9d64bfcfd..3d8a7eecce18 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -521,8 +521,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( StackLayout SSL(StackAlignment); if (StackGuardSlot) { Type *Ty = StackGuardSlot->getAllocatedType(); - uint64_t Align = - std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); + Align Align = std::max(DL.getPrefTypeAlign(Ty), StackGuardSlot->getAlign()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), Align, SSC.getFullLiveRange()); } @@ -534,8 +533,9 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - uint64_t Align = - std::max(DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment()); + Align Align = DL.getPrefTypeAlign(Ty); + if (auto A = Arg->getParamAlign()) + Align = std::max(Align, *A); SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange()); } @@ -546,24 +546,24 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - uint64_t Align = std::max(DL.getPrefTypeAlignment(Ty), AI->getAlignment()); + Align Align = std::max(DL.getPrefTypeAlign(Ty), AI->getAlign()); SSL.addObject(AI, Size, Align, ClColoring ? SSC.getLiveRange(AI) : NoColoringRange); } SSL.computeLayout(); - uint64_t FrameAlignment = SSL.getFrameAlignment(); + Align FrameAlignment = SSL.getFrameAlignment(); // FIXME: tell SSL that we start at a less-then-MaxAlignment aligned location // (AlignmentSkew). if (FrameAlignment > StackAlignment) { // Re-align the base pointer according to the max requested alignment. 
- assert(isPowerOf2_64(FrameAlignment)); IRB.SetInsertPoint(BasePointer->getNextNode()); BasePointer = cast<Instruction>(IRB.CreateIntToPtr( - IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy), - ConstantInt::get(IntPtrTy, ~uint64_t(FrameAlignment - 1))), + IRB.CreateAnd( + IRB.CreatePtrToInt(BasePointer, IntPtrTy), + ConstantInt::get(IntPtrTy, ~(FrameAlignment.value() - 1))), StackPtrTy)); } diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp index 7cdda7743c16..602afcfa9001 100644 --- a/llvm/lib/CodeGen/SafeStackLayout.cpp +++ b/llvm/lib/CodeGen/SafeStackLayout.cpp @@ -37,7 +37,7 @@ LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) { } } -void StackLayout::addObject(const Value *V, unsigned Size, uint64_t Alignment, +void StackLayout::addObject(const Value *V, unsigned Size, Align Alignment, const StackLifetime::LiveRange &Range) { StackObjects.push_back({V, Size, Alignment, Range}); ObjectAlignments[V] = Alignment; @@ -45,7 +45,7 @@ void StackLayout::addObject(const Value *V, unsigned Size, uint64_t Alignment, } static unsigned AdjustStackOffset(unsigned Offset, unsigned Size, - uint64_t Alignment) { + Align Alignment) { return alignTo(Offset + Size, Alignment) - Size; } @@ -62,7 +62,8 @@ void StackLayout::layoutObject(StackObject &Obj) { } LLVM_DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align " - << Obj.Alignment << ", range " << Obj.Range << "\n"); + << Obj.Alignment.value() << ", range " << Obj.Range + << "\n"); assert(Obj.Alignment <= MaxAlignment); unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment); unsigned End = Start + Obj.Size; diff --git a/llvm/lib/CodeGen/SafeStackLayout.h b/llvm/lib/CodeGen/SafeStackLayout.h index b72450e57080..4ac7af2059f5 100644 --- a/llvm/lib/CodeGen/SafeStackLayout.h +++ b/llvm/lib/CodeGen/SafeStackLayout.h @@ -22,7 +22,7 @@ namespace safestack { /// Compute the layout of an unsafe stack frame. 
class StackLayout { - uint64_t MaxAlignment; + Align MaxAlignment; struct StackRegion { unsigned Start; @@ -40,14 +40,14 @@ class StackLayout { struct StackObject { const Value *Handle; unsigned Size; - uint64_t Alignment; + Align Alignment; StackLifetime::LiveRange Range; }; SmallVector<StackObject, 8> StackObjects; DenseMap<const Value *, unsigned> ObjectOffsets; - DenseMap<const Value *, uint64_t> ObjectAlignments; + DenseMap<const Value *, Align> ObjectAlignments; void layoutObject(StackObject &Obj); @@ -56,7 +56,7 @@ public: /// Add an object to the stack frame. Value pointer is opaque and used as a /// handle to retrieve the object's offset in the frame later. - void addObject(const Value *V, unsigned Size, uint64_t Alignment, + void addObject(const Value *V, unsigned Size, Align Alignment, const StackLifetime::LiveRange &Range); /// Run the layout computation for all previously added objects. @@ -66,13 +66,13 @@ public: unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; } /// Returns the alignment of the object - uint64_t getObjectAlignment(const Value *V) { return ObjectAlignments[V]; } + Align getObjectAlignment(const Value *V) { return ObjectAlignments[V]; } /// Returns the size of the entire frame. unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; } /// Returns the alignment of the frame. 
- uint64_t getFrameAlignment() { return MaxAlignment; } + Align getFrameAlignment() { return MaxAlignment; } void print(raw_ostream &OS); }; diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp index ef3afab2b730..696b29018ae6 100644 --- a/llvm/lib/CodeGen/ScheduleDAG.cpp +++ b/llvm/lib/CodeGen/ScheduleDAG.cpp @@ -618,8 +618,8 @@ std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, do { const SUnit *SU = WorkList.back(); WorkList.pop_back(); - for (int I = SU->Succs.size()-1; I >= 0; --I) { - const SUnit *Succ = SU->Succs[I].getSUnit(); + for (const SDep &SD : llvm::reverse(SU->Succs)) { + const SUnit *Succ = SD.getSUnit(); unsigned s = Succ->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. ExitSU). if (Succ->isBoundaryNode()) @@ -652,8 +652,8 @@ std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU, do { const SUnit *SU = WorkList.back(); WorkList.pop_back(); - for (int I = SU->Preds.size()-1; I >= 0; --I) { - const SUnit *Pred = SU->Preds[I].getSUnit(); + for (const SDep &SD : llvm::reverse(SU->Preds)) { + const SUnit *Pred = SD.getSUnit(); unsigned s = Pred->NodeNum; // Edges to non-SUnits are allowed but ignored (e.g. EntrySU). 
if (Pred->isBoundaryNode()) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index df5a041b87cd..067ad819e0d2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -487,10 +487,7 @@ namespace { SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); - SDValue visitFMINNUM(SDNode *N); - SDValue visitFMAXNUM(SDNode *N); - SDValue visitFMINIMUM(SDNode *N); - SDValue visitFMAXIMUM(SDNode *N); + SDValue visitFMinMax(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); @@ -1701,10 +1698,10 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); - case ISD::FMINNUM: return visitFMINNUM(N); - case ISD::FMAXNUM: return visitFMAXNUM(N); - case ISD::FMINIMUM: return visitFMINIMUM(N); - case ISD::FMAXIMUM: return visitFMAXIMUM(N); + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINIMUM: + case ISD::FMAXIMUM: return visitFMinMax(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); @@ -2260,6 +2257,21 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (add x, undef) -> undef + if (N0.isUndef()) + return N0; + if (N1.isUndef()) + return N1; + + // fold (add c1, c2) -> c1+c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::ADD, DL, VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) @@ -2268,23 +2280,6 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { // fold (add x, 0) -> x, vector edition if 
(ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - } - - // fold (add x, undef) -> undef - if (N0.isUndef()) - return N0; - - if (N1.isUndef()) - return N1; - - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { - // canonicalize constant to RHS - if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::ADD, DL, VT, N1, N0); - // fold (add c1, c2) -> c1+c2 - return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}); } // fold (add x, 0) -> x @@ -2554,6 +2549,19 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (add_sat x, undef) -> -1 + if (N0.isUndef() || N1.isUndef()) + return DAG.getAllOnesConstant(DL, VT); + + // fold (add_sat c1, c2) -> c3 + if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(Opcode, DL, VT, N1, N0); + // fold vector ops if (VT.isVector()) { // TODO SimplifyVBinOp @@ -2561,20 +2569,6 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) { // fold (add_sat x, 0) -> x, vector edition if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - } - - // fold (add_sat x, undef) -> -1 - if (N0.isUndef() || N1.isUndef()) - return DAG.getAllOnesConstant(DL, VT); - - if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { - // canonicalize constant to RHS - if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(Opcode, DL, VT, N1, N0); - // fold (add_sat c1, c2) -> c3 - return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}); } // fold (add_sat x, 0) -> x @@ -3260,6 +3254,15 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); + // fold (sub x, x) -> 0 + // FIXME: Refactor this and xor 
and other similar operations together. + if (N0 == N1) + return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); + + // fold (sub c1, c2) -> c3 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) @@ -3270,15 +3273,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } - // fold (sub x, x) -> 0 - // FIXME: Refactor this and xor and other similar operations together. - if (N0 == N1) - return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations); - - // fold (sub c1, c2) -> c3 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -3611,15 +3605,6 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) { - // TODO SimplifyVBinOp - - // fold (sub_sat x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) - return N0; - } - // fold (sub_sat x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); @@ -3632,6 +3617,15 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) { if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1})) return C; + // fold vector ops + if (VT.isVector()) { + // TODO SimplifyVBinOp + + // fold (sub_sat x, 0) -> x, vector edition + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) + return N0; + } + // fold (sub_sat x, 0) -> x if (isNullConstant(N1)) return N0; @@ -3781,6 +3775,15 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); + // fold (mul c1, c2) -> c1*c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS (vector doesn't have to splat) + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + 
!DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); + bool N1IsConst = false; bool N1IsOpaqueConst = false; APInt ConstValue1; @@ -3802,15 +3805,6 @@ SDValue DAGCombiner::visitMUL(SDNode *N) { } } - // fold (mul c1, c2) -> c1*c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS (vector doesn't have to splat) - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); - // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1.isZero()) return N1; @@ -4140,17 +4134,17 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) { EVT CCVT = getSetCCResultType(VT); SDLoc DL(N); + // fold (sdiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (sdiv c1, c2) -> c1/c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1})) - return C; - // fold (sdiv X, -1) -> 0-X + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N1C->isAllOnes()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); @@ -4284,17 +4278,17 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) { EVT CCVT = getSetCCResultType(VT); SDLoc DL(N); + // fold (udiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (udiv c1, c2) -> c1/c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1})) - return C; - // fold (udiv X, -1) -> select(X == -1, 1, 0) + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N1C->isAllOnes()) 
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(1, DL, VT), @@ -4463,6 +4457,15 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // fold (mulhs c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0); + if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; @@ -4474,15 +4477,6 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) { return DAG.getConstant(0, DL, VT); } - // fold (mulhs c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1})) - return C; - - // canonicalize constant to RHS. - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0); - // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; @@ -4523,6 +4517,15 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { EVT VT = N->getValueType(0); SDLoc DL(N); + // fold (mulhu c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1})) + return C; + + // canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0); + if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; @@ -4534,15 +4537,6 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { return DAG.getConstant(0, DL, VT); } - // fold (mulhu c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1})) - return C; - - // canonicalize constant to RHS. 
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0); - // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; @@ -4786,12 +4780,14 @@ SDValue DAGCombiner::visitMULO(SDNode *N) { } // Function to calculate whether the Min/Max pair of SDNodes (potentially -// swapped around) make a signed saturate pattern, clamping to between -2^(BW-1) -// and 2^(BW-1)-1. Returns the node being clamped and the bitwidth of the clamp -// in BW. Should work with both SMIN/SMAX nodes and setcc/select combo. The -// operands are the same as SimplifySelectCC. N0<N1 ? N2 : N3 +// swapped around) make a signed saturate pattern, clamping to between a signed +// saturate of -2^(BW-1) and 2^(BW-1)-1, or an unsigned saturate of 0 and 2^BW. +// Returns the node being clamped and the bitwidth of the clamp in BW. Should +// work with both SMIN/SMAX nodes and setcc/select combo. The operands are the +// same as SimplifySelectCC. N0<N1 ? N2 : N3. 
static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, - SDValue N3, ISD::CondCode CC, unsigned &BW) { + SDValue N3, ISD::CondCode CC, unsigned &BW, + bool &Unsigned) { auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC) { // The compare and select operand should be the same or the select operands @@ -4858,17 +4854,27 @@ static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2, const APInt &MinC = MinCOp->getAPIntValue(); const APInt &MaxC = MaxCOp->getAPIntValue(); APInt MinCPlus1 = MinC + 1; - if (-MaxC != MinCPlus1 || !MinCPlus1.isPowerOf2()) - return SDValue(); - BW = MinCPlus1.exactLogBase2() + 1; - return N02; + if (-MaxC == MinCPlus1 && MinCPlus1.isPowerOf2()) { + BW = MinCPlus1.exactLogBase2() + 1; + Unsigned = false; + return N02; + } + + if (MaxC == 0 && MinCPlus1.isPowerOf2()) { + BW = MinCPlus1.exactLogBase2(); + Unsigned = true; + return N02; + } + + return SDValue(); } static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, SelectionDAG &DAG) { unsigned BW; - SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW); + bool Unsigned; + SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned); if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT) return SDValue(); EVT FPVT = Fp.getOperand(0).getValueType(); @@ -4876,13 +4882,14 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2, if (FPVT.isVector()) NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT, FPVT.getVectorElementCount()); - if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat( - ISD::FP_TO_SINT_SAT, Fp.getOperand(0).getValueType(), NewVT)) + unsigned NewOpc = Unsigned ? 
ISD::FP_TO_UINT_SAT : ISD::FP_TO_SINT_SAT; + if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(NewOpc, FPVT, NewVT)) return SDValue(); SDLoc DL(Fp); - SDValue Sat = DAG.getNode(ISD::FP_TO_SINT_SAT, DL, NewVT, Fp.getOperand(0), + SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0), DAG.getValueType(NewVT.getScalarType())); - return DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); + return Unsigned ? DAG.getZExtOrTrunc(Sat, DL, N2->getValueType(0)) + : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0)); } SDValue DAGCombiner::visitIMINMAX(SDNode *N) { @@ -4892,11 +4899,6 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { unsigned Opcode = N->getOpcode(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - // fold operation with constant operands. if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1})) return C; @@ -4904,7 +4906,12 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) { // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(N->getOpcode(), DL, VT, N1, N0); + return DAG.getNode(Opcode, DL, VT, N1, N0); + + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX. // Only do this if the current op isn't legal and the flipped is. 
@@ -5777,6 +5784,15 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (N0 == N1) return N0; + // fold (and c1, c2) -> c1&c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) @@ -5824,22 +5840,13 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - // fold (and c1, c2) -> c1&c2 - ConstantSDNode *N1C = isConstOrConstSplat(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); - // fold (and x, -1) -> x if (isAllOnesConstant(N1)) return N0; // if (and x, c) is known to be zero, return 0 unsigned BitWidth = VT.getScalarSizeInBits(); + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); @@ -6546,21 +6553,25 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (N0 == N1) return N0; + // fold (or c1, c2) -> c1|c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) + return C; + + // canonicalize constant to RHS + if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; // fold (or x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) return N0; // fold (or x, -1) -> -1, vector 
edition - if (ISD::isConstantSplatVectorAllOnes(N0.getNode())) - // do not return N0, because undef node may exist in N0 - return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); @@ -6629,16 +6640,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) { } } - // fold (or c1, c2) -> c1|c2 - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); - if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1})) - return C; - - // canonicalize constant to RHS - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) - return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); - // fold (or x, 0) -> x if (isNullConstant(N1)) return N0; @@ -6651,6 +6652,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return NewSel; // fold (or x, c) -> c iff (x & ~c) == 0 + ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; @@ -7941,18 +7943,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { EVT VT = N0.getValueType(); SDLoc DL(N); - // fold vector ops - if (VT.isVector()) { - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - - // fold (xor x, 0) -> x, vector edition - if (ISD::isConstantSplatVectorAllZeros(N0.getNode())) - return N1; - if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) - return N0; - } - // fold (xor undef, undef) -> 0. This is a common idiom (misuse). 
if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, DL, VT); @@ -7969,9 +7959,19 @@ SDValue DAGCombiner::visitXOR(SDNode *N) { // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - !DAG.isConstantIntBuildVectorOrConstantInt(N1)) + !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) { + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + + // fold (xor x, 0) -> x, vector edition + if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) + return N0; + } + // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; @@ -8409,6 +8409,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { EVT ShiftVT = N1.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (shl c1, c2) -> c1<<c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) @@ -8434,12 +8438,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (shl c1, c2) -> c1<<c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -8558,6 +8556,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 // TODO - support non-uniform vector shift amounts. 
+ ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { @@ -8758,6 +8757,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (sra c1, c2) -> (sra c1, c2) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) + return C; + // Arithmetic shifting an all-sign-bit value is a no-op. // fold (sra 0, x) -> 0 // fold (sra -1, x) -> -1 @@ -8769,17 +8772,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (sra c1, c2) -> (sra c1, c2) - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. 
+ ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); @@ -8962,21 +8960,20 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); + // fold (srl c1, c2) -> c1 >>u c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N))) return FoldedVOp; - ConstantSDNode *N1C = isConstOrConstSplat(N1); - - // fold (srl c1, c2) -> c1 >>u c2 - if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1})) - return C; - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (srl x, c) is known to be zero, return 0 + ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); @@ -10043,6 +10040,8 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); SDValue Mask = MST->getMask(); SDValue Chain = MST->getChain(); + SDValue Value = MST->getValue(); + SDValue Ptr = MST->getBasePtr(); SDLoc DL(N); // Zap masked stores with a zero mask. @@ -10054,12 +10053,50 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() && !MST->isCompressingStore() && !MST->isTruncatingStore()) return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(), - MST->getBasePtr(), MST->getMemOperand()); + MST->getBasePtr(), MST->getPointerInfo(), + MST->getOriginalAlign(), MachineMemOperand::MOStore, + MST->getAAInfo()); // Try transforming N to an indexed store. 
if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); + if (MST->isTruncatingStore() && MST->isUnindexed() && + Value.getValueType().isInteger() && + (!isa<ConstantSDNode>(Value) || + !cast<ConstantSDNode>(Value)->isOpaque())) { + APInt TruncDemandedBits = + APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), + MST->getMemoryVT().getScalarSizeInBits()); + + // See if we can simplify the operation with + // SimplifyDemandedBits, which only works if the value has a single use. + if (SimplifyDemandedBits(Value, TruncDemandedBits)) { + // Re-visit the store if anything changed and the store hasn't been merged + // with another node (N is deleted) SimplifyDemandedBits will add Value's + // node back to the worklist if necessary, but we also need to re-visit + // the Store node itself. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); + return SDValue(N, 0); + } + } + + // If this is a TRUNC followed by a masked store, fold this into a masked + // truncating store. We can do this even if this is already a masked + // truncstore. + if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && + MST->isUnindexed() && + TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), + MST->getMemoryVT(), LegalOperations)) { + auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(), + Value.getOperand(0).getValueType()); + return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + MST->getOffset(), Mask, MST->getMemoryVT(), + MST->getMemOperand(), MST->getAddressingMode(), + /*IsTruncating=*/true); + } + return SDValue(); } @@ -10109,8 +10146,10 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { // FIXME: Can we do this for indexed, expanding, or extending loads? 
if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() && !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) { - SDValue NewLd = DAG.getLoad(N->getValueType(0), SDLoc(N), MLD->getChain(), - MLD->getBasePtr(), MLD->getMemOperand()); + SDValue NewLd = DAG.getLoad( + N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(), + MLD->getPointerInfo(), MLD->getOriginalAlign(), + MachineMemOperand::MOLoad, MLD->getAAInfo(), MLD->getRanges()); return CombineTo(N, NewLd, NewLd.getValue(1)); } @@ -13876,19 +13915,19 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; - // fold vector ops - if (VT.isVector()) - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - // fold (fadd c1, c2) -> c1 + c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FADD, DL, VT, N0, N1); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FADD, DL, VT, {N0, N1})) + return C; // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); if (N1C && N1C->isZero()) @@ -14084,15 +14123,15 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; + // fold (fsub c1, c2) -> c1-c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FSUB, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (fsub c1, c2) -> c1-c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FSUB, DL, VT, N0, N1); - if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14157,7 +14196,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue DAGCombiner::visitFMUL(SDNode *N) { 
SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true); EVT VT = N->getValueType(0); SDLoc DL(N); @@ -14168,22 +14206,20 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; - // fold vector ops - if (VT.isVector()) { - // This just handles C1 * C2 for vectors. Other vector folds are below. - if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) - return FoldedVOp; - } - // fold (fmul c1, c2) -> c1*c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FMUL, DL, VT, N0, N1); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMUL, DL, VT, {N0, N1})) + return C; // canonicalize constant to RHS if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMUL, DL, VT, N1, N0); + // fold vector ops + if (VT.isVector()) + if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) + return FoldedVOp; + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14495,8 +14531,6 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; @@ -14506,15 +14540,15 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags)) return R; + // fold (fdiv c1, c2) -> c1/c2 + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FDIV, DL, VT, {N0, N1})) + return C; + // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N, DL)) return FoldedVOp; - // fold (fdiv c1, c2) -> c1/c2 - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1); - if 
(SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14523,7 +14557,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. - if (N1CFP) { + if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) { // Compute the reciprocal 1.0 / c2. const APFloat &N1APF = N1CFP->getValueAPF(); APFloat Recip(N1APF.getSemantics(), 1); // 1.0 @@ -14639,8 +14673,6 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); - ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); SDNodeFlags Flags = N->getFlags(); SelectionDAG::FlagInserter FlagsInserter(DAG, N); @@ -14649,9 +14681,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) { return R; // fold (frem c1, c2) -> fmod(c1,c2) - if (N0CFP && N1CFP) - return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1); - + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1})) + return C; + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -14712,12 +14744,12 @@ static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0); - bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); - if (N0CFP && N1CFP) // Constant fold - return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); + // fold (fcopysign c1, c2) -> fcopysign(c1,c2) + if (SDValue C = + DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1})) + return C; if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) { const APFloat &V = N1C->getValueAPF(); @@ -14835,14 +14867,6 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { static 
SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { - // This optimization is guarded by a function attribute because it may produce - // unexpected results. Ie, programs may be relying on the platform-specific - // undefined behavior when the float-to-int conversion overflows. - const Function &F = DAG.getMachineFunction().getFunction(); - Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow"); - if (StrictOverflow.getValueAsString().equals("false")) - return SDValue(); - // We only do this if the target has legal ftrunc. Otherwise, we'd likely be // replacing casts with a libcall. We also must be allowed to ignore -0.0 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer @@ -15216,31 +15240,26 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { return SDValue(); } -static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, - APFloat (*Op)(const APFloat &, const APFloat &)) { +SDValue DAGCombiner::visitFMinMax(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); - const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); - const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); const SDNodeFlags Flags = N->getFlags(); unsigned Opc = N->getOpcode(); bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM; bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM; SelectionDAG::FlagInserter FlagsInserter(DAG, N); - if (N0CFP && N1CFP) { - const APFloat &C0 = N0CFP->getValueAPF(); - const APFloat &C1 = N1CFP->getValueAPF(); - return DAG.getConstantFP(Op(C0, C1), SDLoc(N), VT); - } + // Constant fold. + if (SDValue C = DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, {N0, N1})) + return C; // Canonicalize to constant on RHS. 
if (DAG.isConstantFPBuildVectorOrConstantFP(N0) && !DAG.isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); - if (N1CFP) { + if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) { const APFloat &AF = N1CFP->getValueAPF(); // minnum(X, nan) -> X @@ -15272,22 +15291,6 @@ static SDValue visitFMinMax(SelectionDAG &DAG, SDNode *N, return SDValue(); } -SDValue DAGCombiner::visitFMINNUM(SDNode *N) { - return visitFMinMax(DAG, N, minnum); -} - -SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { - return visitFMinMax(DAG, N, maxnum); -} - -SDValue DAGCombiner::visitFMINIMUM(SDNode *N) { - return visitFMinMax(DAG, N, minimum); -} - -SDValue DAGCombiner::visitFMAXIMUM(SDNode *N) { - return visitFMinMax(DAG, N, maximum); -} - SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -18392,8 +18395,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) { if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) { if (ST->isUnindexed() && ST->isSimple() && ST1->isUnindexed() && ST1->isSimple()) { - if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value && - ST->getMemoryVT() == ST1->getMemoryVT() && + if (OptLevel != CodeGenOpt::None && ST1->getBasePtr() == Ptr && + ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT() && ST->getAddressSpace() == ST1->getAddressSpace()) { // If this is a store followed by a store with the same value to the // same location, then the store is dead/noop. @@ -20727,6 +20730,156 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { return NewLd; } +/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)), +/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?), +/// EXTRACT_SUBVECTOR(Op?, ?), +/// Mask')) +/// iff it is legal and profitable to do so. Notably, the trimmed mask +/// (containing only the elements that are extracted) +/// must reference at most two subvectors. 
+static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { + assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR && + "Must only be called on EXTRACT_SUBVECTOR's"); + + SDValue N0 = N->getOperand(0); + + // Only deal with non-scalable vectors. + EVT NarrowVT = N->getValueType(0); + EVT WideVT = N0.getValueType(); + if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector()) + return SDValue(); + + // The operand must be a shufflevector. + auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0); + if (!WideShuffleVector) + return SDValue(); + + // The old shuffleneeds to go away. + if (!WideShuffleVector->hasOneUse()) + return SDValue(); + + // And the narrow shufflevector that we'll form must be legal. + if (LegalOperations && + !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT)) + return SDValue(); + + uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1); + int NumEltsExtracted = NarrowVT.getVectorNumElements(); + assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 && + "Extract index is not a multiple of the output vector length."); + + int WideNumElts = WideVT.getVectorNumElements(); + + SmallVector<int, 16> NewMask; + NewMask.reserve(NumEltsExtracted); + SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2> + DemandedSubvectors; + + // Try to decode the wide mask into narrow mask from at most two subvectors. + for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx, + NumEltsExtracted)) { + assert((M >= -1) && (M < (2 * WideNumElts)) && + "Out-of-bounds shuffle mask?"); + + if (M < 0) { + // Does not depend on operands, does not require adjustment. + NewMask.emplace_back(M); + continue; + } + + // From which operand of the shuffle does this shuffle mask element pick? + int WideShufOpIdx = M / WideNumElts; + // Which element of that operand is picked? 
+ int OpEltIdx = M % WideNumElts; + + assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask vector decomposition failure."); + + // And which NumEltsExtracted-sized subvector of that operand is that? + int OpSubvecIdx = OpEltIdx / NumEltsExtracted; + // And which element within that subvector of that operand is that? + int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted; + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx && + "Shuffle mask subvector decomposition failure."); + + assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted + + WideShufOpIdx * WideNumElts) == M && + "Shuffle mask full decomposition failure."); + + SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx); + + if (Op.isUndef()) { + // Picking from an undef operand. Let's adjust mask instead. + NewMask.emplace_back(-1); + continue; + } + + // Profitability check: only deal with extractions from the first subvector. + if (OpSubvecIdx != 0) + return SDValue(); + + const std::pair<SDValue, int> DemandedSubvector = + std::make_pair(Op, OpSubvecIdx); + + if (DemandedSubvectors.insert(DemandedSubvector)) { + if (DemandedSubvectors.size() > 2) + return SDValue(); // We can't handle more than two subvectors. + // How many elements into the WideVT does this subvector start? + int Index = NumEltsExtracted * OpSubvecIdx; + // Bail out if the extraction isn't going to be cheap. + if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index)) + return SDValue(); + } + + // Ok, but from which operand of the new shuffle will this element pick? 
+ int NewOpIdx = + getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector); + assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index."); + + int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted; + NewMask.emplace_back(AdjM); + } + assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask."); + assert(DemandedSubvectors.size() <= 2 && + "Should have ended up demanding at most two subvectors."); + + // Did we discover that the shuffle does not actually depend on operands? + if (DemandedSubvectors.empty()) + return DAG.getUNDEF(NarrowVT); + + // We still perform the exact same EXTRACT_SUBVECTOR, just on different + // operand[s]/index[es], so there is no point in checking for it's legality. + + // Do not turn a legal shuffle into an illegal one. + if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) && + !TLI.isShuffleMaskLegal(NewMask, NarrowVT)) + return SDValue(); + + SDLoc DL(N); + + SmallVector<SDValue, 2> NewOps; + for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/> + &DemandedSubvector : DemandedSubvectors) { + // How many elements into the WideVT does this subvector start? + int Index = NumEltsExtracted * DemandedSubvector.second; + SDValue IndexC = DAG.getVectorIdxConstant(Index, DL); + NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT, + DemandedSubvector.first, IndexC)); + } + assert((NewOps.size() == 1 || NewOps.size() == 2) && + "Should end up with either one or two ops"); + + // If we ended up with only one operand, pad with an undef. 
+ if (NewOps.size() == 1) + NewOps.emplace_back(DAG.getUNDEF(NarrowVT)); + + return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask); +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); @@ -20840,6 +20993,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { } } + if (SDValue V = + foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations)) + return V; + V = peekThroughBitcasts(V); // If the input is a build vector. Try to make a smaller build vector. @@ -22424,15 +22581,9 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - SDValue Ops[] = {LHS, RHS}; unsigned Opcode = N->getOpcode(); SDNodeFlags Flags = N->getFlags(); - // See if we can constant fold the vector operation. - if (SDValue Fold = DAG.FoldConstantArithmetic(Opcode, SDLoc(LHS), - LHS.getValueType(), Ops)) - return Fold; - // Move unary shuffles with identical masks after a vector binop: // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask)) // --> shuffle (VBinOp A, B), Undef, Mask diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 08598eeded7a..5dfb65ef131a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -3367,13 +3367,13 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { } case ISD::FSHL: case ISD::FSHR: - if (TLI.expandFunnelShift(Node, Tmp1, DAG)) - Results.push_back(Tmp1); + if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) + Results.push_back(Expanded); break; case ISD::ROTL: case ISD::ROTR: - if (TLI.expandROT(Node, true /*AllowVectorOps*/, Tmp1, DAG)) - Results.push_back(Tmp1); + if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG)) + Results.push_back(Expanded); break; case ISD::SADDSAT: case ISD::UADDSAT: diff --git 
a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 1fa4d88fcb4a..518e525e13d0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1277,8 +1277,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N, bool IsVP) { SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { // Lower the rotate to shifts and ORs which can be promoted. - SDValue Res; - TLI.expandROT(N, true /*AllowVectorOps*/, Res, DAG); + SDValue Res = TLI.expandROT(N, true /*AllowVectorOps*/, DAG); ReplaceValueWith(SDValue(N, 0), Res); return SDValue(); } @@ -1286,7 +1285,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Rotate(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { SDValue Hi = GetPromotedInteger(N->getOperand(0)); SDValue Lo = GetPromotedInteger(N->getOperand(1)); - SDValue Amount = GetPromotedInteger(N->getOperand(2)); + SDValue Amt = GetPromotedInteger(N->getOperand(2)); SDLoc DL(N); EVT OldVT = N->getOperand(0).getValueType(); @@ -1297,21 +1296,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { unsigned NewBits = VT.getScalarSizeInBits(); // Amount has to be interpreted modulo the old bit width. - Amount = - DAG.getNode(ISD::UREM, DL, VT, Amount, DAG.getConstant(OldBits, DL, VT)); + Amt = DAG.getNode(ISD::UREM, DL, VT, Amt, DAG.getConstant(OldBits, DL, VT)); // If the promoted type is twice the size (or more), then we use the // traditional funnel 'double' shift codegen. This isn't necessary if the // shift amount is constant. // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z % bw)) >> bw. // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z % bw)). 
- if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amount) && + if (NewBits >= (2 * OldBits) && !isa<ConstantSDNode>(Amt) && !TLI.isOperationLegalOrCustom(Opcode, VT)) { SDValue HiShift = DAG.getConstant(OldBits, DL, VT); Hi = DAG.getNode(ISD::SHL, DL, VT, Hi, HiShift); Lo = DAG.getZeroExtendInReg(Lo, DL, OldVT); SDValue Res = DAG.getNode(ISD::OR, DL, VT, Hi, Lo); - Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amount); + Res = DAG.getNode(IsFSHR ? ISD::SRL : ISD::SHL, DL, VT, Res, Amt); if (!IsFSHR) Res = DAG.getNode(ISD::SRL, DL, VT, Res, HiShift); return Res; @@ -1324,9 +1322,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FunnelShift(SDNode *N) { // Increase Amount to shift the result into the lower bits of the promoted // type. if (IsFSHR) - Amount = DAG.getNode(ISD::ADD, DL, VT, Amount, ShiftOffset); + Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, ShiftOffset); - return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amount); + return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt); } SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 98312f91d8c0..03dcd0f6d2c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -83,7 +83,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { SDValue Res(&Node, i); bool Failed = false; // Don't create a value in map. - auto ResId = (ValueToIdMap.count(Res)) ? 
ValueToIdMap[Res] : 0; + auto ResId = ValueToIdMap.lookup(Res); unsigned Mapped = 0; if (ResId && (ReplacedValues.find(ResId) != ReplacedValues.end())) { @@ -301,7 +301,7 @@ ScanOperands: if (IgnoreNodeResults(N->getOperand(i).getNode())) continue; - const auto Op = N->getOperand(i); + const auto &Op = N->getOperand(i); LLVM_DEBUG(dbgs() << "Analyzing operand: "; Op.dump(&DAG)); EVT OpVT = Op.getValueType(); switch (getTypeAction(OpVT)) { @@ -1007,11 +1007,7 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { /// /// ValVT is the type of values that produced the boolean. SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { - SDLoc dl(Bool); - EVT BoolVT = getSetCCResultType(ValVT); - ISD::NodeType ExtendCode = - TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT)); - return DAG.getNode(ExtendCode, dl, BoolVT, Bool); + return TLI.promoteTargetBoolean(DAG, Bool, ValVT); } /// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 88a28a3be53e..1493f36fcd3e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -254,69 +254,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { SDNode *Node = DAG.UpdateNodeOperands(Op.getNode(), Ops); - if (Op.getOpcode() == ISD::LOAD) { - LoadSDNode *LD = cast<LoadSDNode>(Node); - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD) { - LLVM_DEBUG(dbgs() << "\nLegalizing extending vector load: "; - Node->dump(&DAG)); - switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0), - LD->getMemoryVT())) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - return TranslateLegalizeResults(Op, Node); - case TargetLowering::Custom: { - SmallVector<SDValue, 2> ResultVals; - if (LowerOperationWrapper(Node, ResultVals)) { - if (ResultVals.empty()) - return TranslateLegalizeResults(Op, Node); - - Changed = true; - return RecursivelyLegalizeResults(Op, ResultVals); - } - LLVM_FALLTHROUGH; - } - case TargetLowering::Expand: { - Changed = true; - std::pair<SDValue, SDValue> Tmp = ExpandLoad(Node); - AddLegalizedOperand(Op.getValue(0), Tmp.first); - AddLegalizedOperand(Op.getValue(1), Tmp.second); - return Op.getResNo() ? 
Tmp.first : Tmp.second; - } - } - } - } else if (Op.getOpcode() == ISD::STORE) { - StoreSDNode *ST = cast<StoreSDNode>(Node); - EVT StVT = ST->getMemoryVT(); - MVT ValVT = ST->getValue().getSimpleValueType(); - if (StVT.isVector() && ST->isTruncatingStore()) { - LLVM_DEBUG(dbgs() << "\nLegalizing truncating vector store: "; - Node->dump(&DAG)); - switch (TLI.getTruncStoreAction(ValVT, StVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - return TranslateLegalizeResults(Op, Node); - case TargetLowering::Custom: { - SmallVector<SDValue, 1> ResultVals; - if (LowerOperationWrapper(Node, ResultVals)) { - if (ResultVals.empty()) - return TranslateLegalizeResults(Op, Node); - - Changed = true; - return RecursivelyLegalizeResults(Op, ResultVals); - } - LLVM_FALLTHROUGH; - } - case TargetLowering::Expand: { - Changed = true; - SDValue Chain = ExpandStore(Node); - AddLegalizedOperand(Op, Chain); - return Chain; - } - } - } - } - bool HasVectorValueOrOp = llvm::any_of(Node->values(), [](EVT T) { return T.isVector(); }) || llvm::any_of(Node->op_values(), @@ -329,6 +266,22 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (Op.getOpcode()) { default: return TranslateLegalizeResults(Op, Node); + case ISD::LOAD: { + LoadSDNode *LD = cast<LoadSDNode>(Node); + ISD::LoadExtType ExtType = LD->getExtensionType(); + EVT LoadedVT = LD->getMemoryVT(); + if (LoadedVT.isVector() && ExtType != ISD::NON_EXTLOAD) + Action = TLI.getLoadExtAction(ExtType, LD->getValueType(0), LoadedVT); + break; + } + case ISD::STORE: { + StoreSDNode *ST = cast<StoreSDNode>(Node); + EVT StVT = ST->getMemoryVT(); + MVT ValVT = ST->getValue().getSimpleValueType(); + if (StVT.isVector() && ST->isTruncatingStore()) + Action = TLI.getTruncStoreAction(ValVT, StVT); + break; + } case ISD::MERGE_VALUES: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); // This operation lies about being legal: when it claims to be legal, @@ -512,6 
+465,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { switch (Action) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Promote: + assert((Op.getOpcode() != ISD::LOAD && Op.getOpcode() != ISD::STORE) && + "This action is not supported yet!"); LLVM_DEBUG(dbgs() << "Promoting\n"); Promote(Node, ResultVals); assert(!ResultVals.empty() && "No results for promotion?"); @@ -731,8 +686,16 @@ SDValue VectorLegalizer::ExpandStore(SDNode *N) { } void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { - SDValue Tmp; switch (Node->getOpcode()) { + case ISD::LOAD: { + std::pair<SDValue, SDValue> Tmp = ExpandLoad(Node); + Results.push_back(Tmp.first); + Results.push_back(Tmp.second); + return; + } + case ISD::STORE: + Results.push_back(ExpandStore(Node)); + return; case ISD::MERGE_VALUES: for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) Results.push_back(Node->getOperand(i)); @@ -804,15 +767,15 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FSHL: case ISD::FSHR: - if (TLI.expandFunnelShift(Node, Tmp, DAG)) { - Results.push_back(Tmp); + if (SDValue Expanded = TLI.expandFunnelShift(Node, DAG)) { + Results.push_back(Expanded); return; } break; case ISD::ROTL: case ISD::ROTR: - if (TLI.expandROT(Node, false /*AllowVectorOps*/, Tmp, DAG)) { - Results.push_back(Tmp); + if (SDValue Expanded = TLI.expandROT(Node, false /*AllowVectorOps*/, DAG)) { + Results.push_back(Expanded); return; } break; diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 2695ed36991c..3d5c4c5b1cae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -168,10 +168,9 @@ void ResourcePriorityQueue::initNodes(std::vector<SUnit> &sunits) { SUnits = &sunits; NumNodesSolelyBlocking.resize(SUnits->size(), 0); - for (unsigned i = 0, e 
= SUnits->size(); i != e; ++i) { - SUnit *SU = &(*SUnits)[i]; - initNumRegDefsLeft(SU); - SU->NodeQueueId = 0; + for (SUnit &SU : *SUnits) { + initNumRegDefsLeft(&SU); + SU.NodeQueueId = 0; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 84e6d2a16422..aec2cf38b400 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -442,33 +442,32 @@ void ScheduleDAGSDNodes::AddSchedEdges() { bool UnitLatencies = forceUnitLatencies(); // Pass 2: add the preds, succs, etc. - for (unsigned su = 0, e = SUnits.size(); su != e; ++su) { - SUnit *SU = &SUnits[su]; - SDNode *MainNode = SU->getNode(); + for (SUnit &SU : SUnits) { + SDNode *MainNode = SU.getNode(); if (MainNode->isMachineOpcode()) { unsigned Opc = MainNode->getMachineOpcode(); const MCInstrDesc &MCID = TII->get(Opc); for (unsigned i = 0; i != MCID.getNumOperands(); ++i) { if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) { - SU->isTwoAddress = true; + SU.isTwoAddress = true; break; } } if (MCID.isCommutable()) - SU->isCommutable = true; + SU.isCommutable = true; } // Find all predecessors and successors of the group. - for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) { + for (SDNode *N = SU.getNode(); N; N = N->getGluedNode()) { if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).getImplicitDefs()) { - SU->hasPhysRegClobbers = true; + SU.hasPhysRegClobbers = true; unsigned NumUsed = InstrEmitter::CountResults(N); while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1)) --NumUsed; // Skip over unused values at the end. if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs()) - SU->hasPhysRegDefs = true; + SU.hasPhysRegDefs = true; } for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { @@ -477,7 +476,8 @@ void ScheduleDAGSDNodes::AddSchedEdges() { if (isPassiveNode(OpN)) continue; // Not scheduled. 
SUnit *OpSU = &SUnits[OpN->getNodeId()]; assert(OpSU && "Node has no SUnit!"); - if (OpSU == SU) continue; // In the same group. + if (OpSU == &SU) + continue; // In the same group. EVT OpVT = N->getOperand(i).getValueType(); assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!"); @@ -508,10 +508,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() { Dep.setLatency(OpLatency); if (!isChain && !UnitLatencies) { computeOperandLatency(OpN, N, i, Dep); - ST.adjustSchedDependency(OpSU, DefIdx, SU, i, Dep); + ST.adjustSchedDependency(OpSU, DefIdx, &SU, i, Dep); } - if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) { + if (!SU.addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) { // Multiple register uses are combined in the same SUnit. For example, // we could have a set of glued nodes with all their defs consumed by // another set of glued nodes. Register pressure tracking sees this as @@ -721,10 +721,7 @@ void ScheduleDAGSDNodes::dumpSchedule() const { /// void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) { unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp); - unsigned Noops = 0; - for (unsigned i = 0, e = Sequence.size(); i != e; ++i) - if (!Sequence[i]) - ++Noops; + unsigned Noops = llvm::count(Sequence, nullptr); assert(Sequence.size() - Noops == ScheduledNodes && "The number of nodes scheduled doesn't match the expected number!"); } @@ -911,8 +908,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) { } } - for (unsigned i = 0, e = Sequence.size(); i != e; i++) { - SUnit *SU = Sequence[i]; + for (SUnit *SU : Sequence) { if (!SU) { // Null SUnit* is a noop. 
TII->insertNoop(*Emitter.getBlock(), InsertPos); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp index 540a6e3efbe1..10940478010e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp @@ -169,11 +169,11 @@ void ScheduleDAGVLIW::listScheduleTopDown() { releaseSuccessors(&EntrySU); // All leaves to AvailableQueue. - for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { + for (SUnit &SU : SUnits) { // It is available if it has no predecessors. - if (SUnits[i].Preds.empty()) { - AvailableQueue->push(&SUnits[i]); - SUnits[i].isAvailable = true; + if (SU.Preds.empty()) { + AvailableQueue->push(&SU); + SU.isAvailable = true; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c282e03387dd..2ae0d4df7b77 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2499,7 +2499,8 @@ bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask, /// sense to specify which elements are demanded or undefined, therefore /// they are simply ignored. bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, - APInt &UndefElts, unsigned Depth) { + APInt &UndefElts, unsigned Depth) const { + unsigned Opcode = V.getOpcode(); EVT VT = V.getValueType(); assert(VT.isVector() && "Vector type expected"); @@ -2511,7 +2512,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, // Deal with some common cases here that work for both fixed and scalable // vector types. - switch (V.getOpcode()) { + switch (Opcode) { case ISD::SPLAT_VECTOR: UndefElts = V.getOperand(0).isUndef() ? 
APInt::getAllOnes(DemandedElts.getBitWidth()) @@ -2537,7 +2538,12 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: return isSplatValue(V.getOperand(0), DemandedElts, UndefElts, Depth + 1); - } + default: + if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::INTRINSIC_WO_CHAIN || + Opcode == ISD::INTRINSIC_W_CHAIN || Opcode == ISD::INTRINSIC_VOID) + return TLI->isSplatValueForTargetNode(V, DemandedElts, UndefElts, Depth); + break; +} // We don't support other cases than those above for scalable vectors at // the moment. @@ -2548,7 +2554,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch"); UndefElts = APInt::getZero(NumElts); - switch (V.getOpcode()) { + switch (Opcode) { case ISD::BUILD_VECTOR: { SDValue Scl; for (unsigned i = 0; i != NumElts; ++i) { @@ -2600,13 +2606,30 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts, } break; } + case ISD::ANY_EXTEND_VECTOR_INREG: + case ISD::SIGN_EXTEND_VECTOR_INREG: + case ISD::ZERO_EXTEND_VECTOR_INREG: { + // Widen the demanded elts by the src element count. + SDValue Src = V.getOperand(0); + // We don't support scalable vectors at the moment. + if (Src.getValueType().isScalableVector()) + return false; + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + APInt UndefSrcElts; + APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts); + if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts, Depth + 1)) { + UndefElts = UndefSrcElts.truncOrSelf(NumElts); + return true; + } + break; + } } return false; } /// Helper wrapper to main isSplatValue function. 
-bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) { +bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) const { EVT VT = V.getValueType(); assert(VT.isVector() && "Vector type expected"); @@ -5291,9 +5314,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, if (isUndef(Opcode, Ops)) return getUNDEF(VT); - // Handle the case of two scalars. + // Handle binops special cases. if (NumOps == 2) { - // TODO: Move foldConstantFPMath here? + if (SDValue CFP = foldConstantFPMath(Opcode, DL, VT, Ops[0], Ops[1])) + return CFP; if (auto *C1 = dyn_cast<ConstantSDNode>(Ops[0])) { if (auto *C2 = dyn_cast<ConstantSDNode>(Ops[1])) { @@ -5463,10 +5487,11 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, // should. That will require dealing with a potentially non-default // rounding mode, checking the "opStatus" return value from the APFloat // math calculations, and possibly other variations. - auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1.getNode()); - auto *N2CFP = dyn_cast<ConstantFPSDNode>(N2.getNode()); + ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, /*AllowUndefs*/ false); + ConstantFPSDNode *N2CFP = isConstOrConstSplatFP(N2, /*AllowUndefs*/ false); if (N1CFP && N2CFP) { - APFloat C1 = N1CFP->getValueAPF(), C2 = N2CFP->getValueAPF(); + APFloat C1 = N1CFP->getValueAPF(); // make copy + const APFloat &C2 = N2CFP->getValueAPF(); switch (Opcode) { case ISD::FADD: C1.add(C2, APFloat::rmNearestTiesToEven); @@ -5486,6 +5511,14 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, case ISD::FCOPYSIGN: C1.copySign(C2); return getConstantFP(C1, DL, VT); + case ISD::FMINNUM: + return getConstantFP(minnum(C1, C2), DL, VT); + case ISD::FMAXNUM: + return getConstantFP(maxnum(C1, C2), DL, VT); + case ISD::FMINIMUM: + return getConstantFP(minimum(C1, C2), DL, VT); + case ISD::FMAXIMUM: + return getConstantFP(maximum(C1, C2), DL, VT); default: break; } } @@ -5502,8 +5535,9 @@ SDValue 
SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL, switch (Opcode) { case ISD::FSUB: // -0.0 - undef --> undef (consistent with "fneg undef") - if (N1CFP && N1CFP->getValueAPF().isNegZero() && N2.isUndef()) - return getUNDEF(VT); + if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, /*AllowUndefs*/ true)) + if (N1C && N1C->getValueAPF().isNegZero() && N2.isUndef()) + return getUNDEF(VT); LLVM_FALLTHROUGH; case ISD::FADD: @@ -5962,9 +5996,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2})) return SV; - if (SDValue V = foldConstantFPMath(Opcode, DL, VT, N1, N2)) - return V; - // Canonicalize an UNDEF to the RHS, even over a constant. if (N1.isUndef()) { if (TLI->isCommutativeBinOp(Opcode)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 7726a0007e44..63cd723cf6da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1036,7 +1036,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, AA = aa; GFI = gfi; LibInfo = li; - DL = &DAG.getDataLayout(); Context = DAG.getContext(); LPadToCallSiteMap.clear(); SL->init(DAG.getTargetLoweringInfo(), TM, DAG.getDataLayout()); @@ -1626,6 +1625,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { if (const auto *Equiv = dyn_cast<DSOLocalEquivalent>(C)) return getValue(Equiv->getGlobalValue()); + if (const auto *NC = dyn_cast<NoCFIValue>(C)) + return getValue(NC->getGlobalValue()); + VectorType *VecTy = cast<VectorType>(V->getType()); // Now that we know the number and type of the elements, get that number of @@ -1921,8 +1923,8 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) { DAG.getDataLayout().getAllocaAddrSpace()), PtrValueVTs); - SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), - DemoteReg, 
PtrValueVTs[0]); + SDValue RetPtr = + DAG.getCopyFromReg(Chain, getCurSDLoc(), DemoteReg, PtrValueVTs[0]); SDValue RetOp = getValue(I.getOperand(0)); SmallVector<EVT, 4> ValueVTs, MemVTs; @@ -2657,7 +2659,8 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, SDLoc dl = getCurSDLoc(); SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy); const Module &M = *ParentBB->getParent()->getFunction().getParent(); - Align Align = DL->getPrefTypeAlign(Type::getInt8PtrTy(M.getContext())); + Align Align = + DAG.getDataLayout().getPrefTypeAlign(Type::getInt8PtrTy(M.getContext())); // Generate code to load the content of the guard slot. SDValue GuardVal = DAG.getLoad( @@ -3058,14 +3061,14 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) { void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last) { // Update JTCases. - for (unsigned i = 0, e = SL->JTCases.size(); i != e; ++i) - if (SL->JTCases[i].first.HeaderBB == First) - SL->JTCases[i].first.HeaderBB = Last; + for (JumpTableBlock &JTB : SL->JTCases) + if (JTB.first.HeaderBB == First) + JTB.first.HeaderBB = Last; // Update BitTestCases. 
- for (unsigned i = 0, e = SL->BitTestCases.size(); i != e; ++i) - if (SL->BitTestCases[i].Parent == First) - SL->BitTestCases[i].Parent = Last; + for (BitTestBlock &BTB : SL->BitTestCases) + if (BTB.Parent == First) + BTB.Parent = Last; } void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { @@ -3111,6 +3114,8 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; + if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) + Flags.copyFMF(*FPOp); SDValue Op = getValue(I.getOperand(0)); SDValue UnNodeValue = DAG.getNode(Opcode, getCurSDLoc(), Op.getValueType(), @@ -3881,7 +3886,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue(); if (Field) { // N = N + Offset - uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + uint64_t Offset = + DAG.getDataLayout().getStructLayout(StTy)->getElementOffset(Field); // In an inbounds GEP with an offset that is nonnegative even when // interpreted as signed, assume there is no unsigned overflow. @@ -3898,7 +3904,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { // (and fix up the result later). unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS); MVT IdxTy = MVT::getIntegerVT(IdxSize); - TypeSize ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + TypeSize ElementSize = + DAG.getDataLayout().getTypeAllocSize(GTI.getIndexedType()); // We intentionally mask away the high bits here; ElementSize may not // fit in IdxTy. APInt ElementMul(IdxSize, ElementSize.getKnownMinSize()); @@ -4788,7 +4795,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I, } // Use TargetConstant instead of a regular constant for immarg. 
- EVT VT = TLI.getValueType(*DL, Arg->getType(), true); + EVT VT = TLI.getValueType(DAG.getDataLayout(), Arg->getType(), true); if (const ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) { assert(CI->getBitWidth() <= 64 && "large intrinsic immediates not handled"); @@ -6571,7 +6578,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } else { EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); const Value *Global = TLI.getSDagStackGuard(M); - Align Align = DL->getPrefTypeAlign(Global->getType()); + Align Align = DAG.getDataLayout().getPrefTypeAlign(Global->getType()); Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global), MachinePointerInfo(Global, 0), Align, MachineMemOperand::MOVolatile); @@ -7127,12 +7134,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, } SDValue VectorStep = DAG.getStepVector(sdl, VecTy); SDValue VectorInduction = DAG.getNode( - ISD::UADDO, sdl, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); - SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction.getValue(0), + ISD::UADDSAT, sdl, VecTy, VectorIndex, VectorStep); + SDValue SetCC = DAG.getSetCC(sdl, CCVT, VectorInduction, VectorTripCount, ISD::CondCode::SETULT); - setValue(&I, DAG.getNode(ISD::AND, sdl, CCVT, - DAG.getNOT(sdl, VectorInduction.getValue(1), CCVT), - SetCC)); + setValue(&I, SetCC); return; } case Intrinsic::experimental_vector_insert: { @@ -7317,32 +7322,26 @@ static unsigned getISDForVPIntrinsic(const VPIntrinsic &VPIntrin) { void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, SmallVector<SDValue, 7> &OpValues, - bool isGather) { + bool IsGather) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(0); - MaybeAlign Alignment = DAG.getEVTAlign(VT); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); const MDNode *Ranges = 
VPIntrin.getMetadata(LLVMContext::MD_range); SDValue LD; bool AddToChain = true; - if (!isGather) { + if (!IsGather) { // Do not serialize variable-length loads of constant memory with // anything. - MemoryLocation ML; - if (VT.isScalableVector()) - ML = MemoryLocation::getAfter(PtrOperand); - else - ML = MemoryLocation( - PtrOperand, - LocationSize::precise( - DAG.getDataLayout().getTypeStoreSize(VPIntrin.getType())), - AAInfo); + MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo); AddToChain = !AA || !AA->pointsToConstantMemory(ML); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, - VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges); + MemoryLocation::UnknownSize, *Alignment, AAInfo, Ranges); LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2], MMO, false /*IsExpanding */); } else { @@ -7380,18 +7379,20 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin, SmallVector<SDValue, 7> &OpValues, - bool isScatter) { + bool IsScatter) { SDLoc DL = getCurSDLoc(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); Value *PtrOperand = VPIntrin.getArgOperand(1); EVT VT = OpValues[0].getValueType(); - MaybeAlign Alignment = DAG.getEVTAlign(VT); + MaybeAlign Alignment = VPIntrin.getPointerAlignment(); + if (!Alignment) + Alignment = DAG.getEVTAlign(VT); AAMDNodes AAInfo = VPIntrin.getAAMetadata(); SDValue ST; - if (!isScatter) { + if (!IsScatter) { MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, - VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo); + MemoryLocation::UnknownSize, *Alignment, AAInfo); ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1], OpValues[2], 
OpValues[3], MMO, false /* IsTruncating */); @@ -7690,8 +7691,9 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput), PointerType::getUnqual(LoadTy)); - if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr( - const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL)) + if (const Constant *LoadCst = + ConstantFoldLoadFromConstPtr(const_cast<Constant *>(LoadInput), + LoadTy, Builder.DAG.getDataLayout())) return Builder.getValue(LoadCst); } @@ -9646,8 +9648,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // We push in swifterror return as the last element of CLI.Ins. ArgListTy &Args = CLI.getArgs(); if (supportSwiftError()) { - for (unsigned i = 0, e = Args.size(); i != e; ++i) { - if (Args[i].IsSwiftError) { + for (const ArgListEntry &Arg : Args) { + if (Arg.IsSwiftError) { ISD::InputArg MyFlags; MyFlags.VT = getPointerTy(DL); MyFlags.ArgVT = EVT(getPointerTy(DL)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index d6122aa0a739..ea48042a5dcf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -190,7 +190,6 @@ public: static const unsigned LowestSDNodeOrder = 1; SelectionDAG &DAG; - const DataLayout *DL = nullptr; AAResults *AA = nullptr; const TargetLibraryInfo *LibInfo; @@ -568,9 +567,9 @@ private: void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT, - SmallVector<SDValue, 7> &OpValues, bool isGather); + SmallVector<SDValue, 7> &OpValues, bool IsGather); void visitVPStoreScatter(const VPIntrinsic &VPIntrin, - SmallVector<SDValue, 7> &OpValues, bool isScatter); + SmallVector<SDValue, 7> &OpValues, bool IsScatter); void visitVectorPredicationIntrinsic(const 
VPIntrinsic &VPIntrin); void visitVAStart(const CallInst &I); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index c7e37cf8ca14..77e11b364588 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -297,7 +297,7 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, #ifndef NDEBUG dbgs() << "If a target marks an instruction with " "'usesCustomInserter', it must implement " - "TargetLowering::EmitInstrWithCustomInserter!"; + "TargetLowering::EmitInstrWithCustomInserter!\n"; #endif llvm_unreachable(nullptr); } @@ -1784,27 +1784,25 @@ SelectionDAGISel::FinishBasicBlock() { } // Update PHI Nodes - for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size(); - pi != pe; ++pi) { - MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first); + for (const std::pair<MachineInstr *, unsigned> &P : + FuncInfo->PHINodesToUpdate) { + MachineInstrBuilder PHI(*MF, P.first); MachineBasicBlock *PHIBB = PHI->getParent(); assert(PHI->isPHI() && "This is not a machine PHI node that we are updating!"); // This is "default" BB. We have two jumps to it. From "header" BB and // from last "case" BB, unless the latter was skipped. if (PHIBB == BTB.Default) { - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(BTB.Parent); + PHI.addReg(P.second).addMBB(BTB.Parent); if (!BTB.ContiguousRange) { - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second) - .addMBB(BTB.Cases.back().ThisBB); + PHI.addReg(P.second).addMBB(BTB.Cases.back().ThisBB); } } // One of "cases" BB. 
- for (unsigned j = 0, ej = BTB.Cases.size(); - j != ej; ++j) { - MachineBasicBlock* cBB = BTB.Cases[j].ThisBB; + for (const SwitchCG::BitTestCase &BT : BTB.Cases) { + MachineBasicBlock* cBB = BT.ThisBB; if (cBB->isSuccessor(PHIBB)) - PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB); + PHI.addReg(P.second).addMBB(cBB); } } } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 737695b5eabe..e6b06ab93d6b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3136,6 +3136,19 @@ bool TargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return false; } +bool TargetLowering::isSplatValueForTargetNode(SDValue Op, + const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const { + assert((Op.getOpcode() >= ISD::BUILTIN_OP_END || + Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_W_CHAIN || + Op.getOpcode() == ISD::INTRINSIC_VOID) && + "Should use isSplatValue if you don't know whether Op" + " is a target node!"); + return false; +} + // FIXME: Ideally, this would use ISD::isConstantSplatVector(), but that must // work with truncating build vectors and vectors with elements of less than // 8 bits. @@ -4853,13 +4866,9 @@ TargetLowering::ParseConstraints(const DataLayout &DL, } // Now select chosen alternative in each constraint. - for (unsigned cIndex = 0, eIndex = ConstraintOperands.size(); - cIndex != eIndex; ++cIndex) { - AsmOperandInfo &cInfo = ConstraintOperands[cIndex]; - if (cInfo.Type == InlineAsm::isClobber) - continue; - cInfo.selectAlternative(bestMAIndex); - } + for (AsmOperandInfo &cInfo : ConstraintOperands) + if (cInfo.Type != InlineAsm::isClobber) + cInfo.selectAlternative(bestMAIndex); } } @@ -4927,9 +4936,9 @@ TargetLowering::ConstraintWeight ConstraintWeight BestWeight = CW_Invalid; // Loop over the options, keeping track of the most general one. 
- for (unsigned i = 0, e = rCodes->size(); i != e; ++i) { + for (const std::string &rCode : *rCodes) { ConstraintWeight weight = - getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str()); + getSingleConstraintMatchWeight(info, rCode.c_str()); if (weight > BestWeight) BestWeight = weight; } @@ -6550,15 +6559,15 @@ static bool isNonZeroModBitWidthOrUndef(SDValue Z, unsigned BW) { true); } -bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, - SelectionDAG &DAG) const { +SDValue TargetLowering::expandFunnelShift(SDNode *Node, + SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); if (VT.isVector() && (!isOperationLegalOrCustom(ISD::SHL, VT) || !isOperationLegalOrCustom(ISD::SRL, VT) || !isOperationLegalOrCustom(ISD::SUB, VT) || !isOperationLegalOrCustomOrPromote(ISD::OR, VT))) - return false; + return SDValue(); SDValue X = Node->getOperand(0); SDValue Y = Node->getOperand(1); @@ -6592,8 +6601,7 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, } Z = DAG.getNOT(DL, Z, ShVT); } - Result = DAG.getNode(RevOpcode, DL, VT, X, Y, Z); - return true; + return DAG.getNode(RevOpcode, DL, VT, X, Y, Z); } SDValue ShX, ShY; @@ -6633,13 +6641,12 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result, ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt); } } - Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY); - return true; + return DAG.getNode(ISD::OR, DL, VT, ShX, ShY); } // TODO: Merge with expandFunnelShift. 
-bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, - SDValue &Result, SelectionDAG &DAG) const { +SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, + SelectionDAG &DAG) const { EVT VT = Node->getValueType(0); unsigned EltSizeInBits = VT.getScalarSizeInBits(); bool IsLeft = Node->getOpcode() == ISD::ROTL; @@ -6650,12 +6657,12 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, EVT ShVT = Op1.getValueType(); SDValue Zero = DAG.getConstant(0, DL, ShVT); - // If a rotate in the other direction is supported, use it. + // If a rotate in the other direction is more supported, use it. unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL; - if (isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) { + if (!isOperationLegalOrCustom(Node->getOpcode(), VT) && + isOperationLegalOrCustom(RevRot, VT) && isPowerOf2_32(EltSizeInBits)) { SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1); - Result = DAG.getNode(RevRot, DL, VT, Op0, Sub); - return true; + return DAG.getNode(RevRot, DL, VT, Op0, Sub); } if (!AllowVectorOps && VT.isVector() && @@ -6664,7 +6671,7 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, !isOperationLegalOrCustom(ISD::SUB, VT) || !isOperationLegalOrCustomOrPromote(ISD::OR, VT) || !isOperationLegalOrCustomOrPromote(ISD::AND, VT))) - return false; + return SDValue(); unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL; unsigned HsOpc = IsLeft ? 
ISD::SRL : ISD::SHL; @@ -6690,8 +6697,7 @@ bool TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps, HsVal = DAG.getNode(HsOpc, DL, VT, DAG.getNode(HsOpc, DL, VT, Op0, One), HsAmt); } - Result = DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal); - return true; + return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal); } void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi, @@ -8048,7 +8054,8 @@ SDValue TargetLowering::expandIntMINMAX(SDNode *Node, SelectionDAG &DAG) const { if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) return DAG.UnrollVectorOp(Node); - SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC); + EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); + SDValue Cond = DAG.getSetCC(DL, BoolVT, Op0, Op1, CC); return DAG.getSelect(DL, VT, Cond, Op0, Op1); } diff --git a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp index 86b559fd6413..43a54ce33bf0 100644 --- a/llvm/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/llvm/lib/CodeGen/ShadowStackGCLowering.cpp @@ -162,8 +162,8 @@ Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) { // doInitialization creates the generic version of this type. std::vector<Type *> EltTys; EltTys.push_back(StackEntryTy); - for (size_t I = 0; I != Roots.size(); I++) - EltTys.push_back(Roots[I].second->getAllocatedType()); + for (const std::pair<CallInst *, AllocaInst *> &Root : Roots) + EltTys.push_back(Root.second->getAllocatedType()); return StructType::create(EltTys, ("gc_stackentry." 
+ F.getName()).str()); } @@ -240,8 +240,8 @@ void ShadowStackGCLowering::CollectRoots(Function &F) { SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots; for (BasicBlock &BB : F) - for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) - if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) + for (Instruction &I : BB) + if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(&I)) if (Function *F = CI->getCalledFunction()) if (F->getIntrinsicID() == Intrinsic::gcroot) { std::pair<CallInst *, AllocaInst *> Pair = std::make_pair( @@ -377,9 +377,9 @@ bool ShadowStackGCLowering::runOnFunction(Function &F) { // Delete the original allocas (which are no longer used) and the intrinsic // calls (which are no longer valid). Doing this last avoids invalidating // iterators. - for (unsigned I = 0, E = Roots.size(); I != E; ++I) { - Roots[I].first->eraseFromParent(); - Roots[I].second->eraseFromParent(); + for (std::pair<CallInst *, AllocaInst *> &Root : Roots) { + Root.first->eraseFromParent(); + Root.second->eraseFromParent(); } Roots.clear(); diff --git a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp index 5ccfacfc26dc..3640296adbca 100644 --- a/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp +++ b/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp @@ -131,15 +131,15 @@ bool StackMapLiveness::calculateLiveness(MachineFunction &MF) { bool HasStackMap = false; // Reverse iterate over all instructions and add the current live register // set to an instruction if we encounter a patchpoint instruction. 
- for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) { - if (I->getOpcode() == TargetOpcode::PATCHPOINT) { - addLiveOutSetToMI(MF, *I); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MI.getOpcode() == TargetOpcode::PATCHPOINT) { + addLiveOutSetToMI(MF, MI); HasChanged = true; HasStackMap = true; ++NumStackMaps; } - LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << *I); - LiveRegs.stepBackward(*I); + LLVM_DEBUG(dbgs() << " " << LiveRegs << " " << MI); + LiveRegs.stepBackward(MI); } ++NumBBsVisited; if (!HasStackMap) diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 7445f77c955d..6765fd274686 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -162,7 +162,7 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge, } bool StackProtector::HasAddressTaken(const Instruction *AI, - uint64_t AllocSize) { + TypeSize AllocSize) { const DataLayout &DL = M->getDataLayout(); for (const User *U : AI->users()) { const auto *I = cast<Instruction>(U); @@ -170,7 +170,8 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // the bounds of the allocated object. Optional<MemoryLocation> MemLoc = MemoryLocation::getOrNone(I); if (MemLoc.hasValue() && MemLoc->Size.hasValue() && - MemLoc->Size.getValue() > AllocSize) + !TypeSize::isKnownGE(AllocSize, + TypeSize::getFixed(MemLoc->Size.getValue()))) return true; switch (I->getOpcode()) { case Instruction::Store: @@ -203,13 +204,19 @@ bool StackProtector::HasAddressTaken(const Instruction *AI, // would use it could also be out-of-bounds meaning stack protection is // required. 
const GetElementPtrInst *GEP = cast<GetElementPtrInst>(I); - unsigned TypeSize = DL.getIndexTypeSizeInBits(I->getType()); - APInt Offset(TypeSize, 0); - APInt MaxOffset(TypeSize, AllocSize); - if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.ugt(MaxOffset)) + unsigned IndexSize = DL.getIndexTypeSizeInBits(I->getType()); + APInt Offset(IndexSize, 0); + if (!GEP->accumulateConstantOffset(DL, Offset)) + return true; + TypeSize OffsetSize = TypeSize::Fixed(Offset.getLimitedValue()); + if (!TypeSize::isKnownGT(AllocSize, OffsetSize)) return true; // Adjust AllocSize to be the space remaining after this offset. - if (HasAddressTaken(I, AllocSize - Offset.getLimitedValue())) + // We can't subtract a fixed size from a scalable one, so in that case + // assume the scalable value is of minimum size. + TypeSize NewAllocSize = + TypeSize::Fixed(AllocSize.getKnownMinValue()) - OffsetSize; + if (HasAddressTaken(I, NewAllocSize)) return true; break; } diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp index f49ba5ccd447..17e6f51d0899 100644 --- a/llvm/lib/CodeGen/StackSlotColoring.cpp +++ b/llvm/lib/CodeGen/StackSlotColoring.cpp @@ -325,8 +325,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "Color spill slot intervals:\n"); bool Changed = false; - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { - LiveInterval *li = SSIntervals[i]; + for (LiveInterval *li : SSIntervals) { int SS = Register::stackSlot2Index(li->reg()); int NewSS = ColorSlot(li); assert(NewSS >= 0 && "Stack coloring failed?"); @@ -338,8 +337,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) { } LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n"); - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) { - LiveInterval *li = SSIntervals[i]; + for (LiveInterval *li : SSIntervals) { int SS = Register::stackSlot2Index(li->reg()); li->setWeight(SlotWeights[SS]); } @@ -347,8 +345,8 @@ bool 
StackSlotColoring::ColorSlots(MachineFunction &MF) { llvm::stable_sort(SSIntervals, IntervalSorter()); #ifndef NDEBUG - for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) - LLVM_DEBUG(SSIntervals[i]->dump()); + for (LiveInterval *li : SSIntervals) + LLVM_DEBUG(li->dump()); LLVM_DEBUG(dbgs() << '\n'); #endif diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 54fc6ee45d00..68a7b80d6146 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -207,35 +207,34 @@ bool TailDuplicator::tailDuplicateAndUpdate( // Add the new vregs as available values. DenseMap<Register, AvailableValsTy>::iterator LI = SSAUpdateVals.find(VReg); - for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = LI->second[j].first; - Register SrcReg = LI->second[j].second; + for (std::pair<MachineBasicBlock *, Register> &J : LI->second) { + MachineBasicBlock *SrcBB = J.first; + Register SrcReg = J.second; SSAUpdate.AddAvailableValue(SrcBB, SrcReg); } + SmallVector<MachineOperand *> DebugUses; // Rewrite uses that are outside of the original def's block. - MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg); - // Only remove instructions after loop, as DBG_VALUE_LISTs with multiple - // uses of VReg may invalidate the use iterator when erased. - SmallPtrSet<MachineInstr *, 4> InstrsToRemove; - while (UI != MRI->use_end()) { - MachineOperand &UseMO = *UI; + for (MachineOperand &UseMO : + llvm::make_early_inc_range(MRI->use_operands(VReg))) { MachineInstr *UseMI = UseMO.getParent(); - ++UI; + // Rewrite debug uses last so that they can take advantage of any + // register mappings introduced by other users in its BB, since we + // cannot create new register definitions specifically for the debug + // instruction (as debug instructions should not affect CodeGen). if (UseMI->isDebugValue()) { - // SSAUpdate can replace the use with an undef. 
That creates - // a debug instruction that is a kill. - // FIXME: Should it SSAUpdate job to delete debug instructions - // instead of replacing the use with undef? - InstrsToRemove.insert(UseMI); + DebugUses.push_back(&UseMO); continue; } if (UseMI->getParent() == DefBB && !UseMI->isPHI()) continue; SSAUpdate.RewriteUse(UseMO); } - for (auto *MI : InstrsToRemove) - MI->eraseFromParent(); + for (auto *UseMO : DebugUses) { + MachineInstr *UseMI = UseMO->getParent(); + UseMO->setReg( + SSAUpdate.GetValueInMiddleOfBlock(UseMI->getParent(), true)); + } } SSAUpdateVRs.clear(); @@ -511,8 +510,8 @@ void TailDuplicator::updateSuccessorsPHIs( SSAUpdateVals.find(Reg); if (LI != SSAUpdateVals.end()) { // This register is defined in the tail block. - for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = LI->second[j].first; + for (const std::pair<MachineBasicBlock *, Register> &J : LI->second) { + MachineBasicBlock *SrcBB = J.first; // If we didn't duplicate a bb into a particular predecessor, we // might still have added an entry to SSAUpdateVals to correcly // recompute SSA. If that case, avoid adding a dummy extra argument @@ -520,7 +519,7 @@ void TailDuplicator::updateSuccessorsPHIs( if (!SrcBB->isSuccessor(SuccBB)) continue; - Register SrcReg = LI->second[j].second; + Register SrcReg = J.second; if (Idx != 0) { MI.getOperand(Idx).setReg(SrcReg); MI.getOperand(Idx + 1).setMBB(SrcBB); @@ -531,8 +530,7 @@ void TailDuplicator::updateSuccessorsPHIs( } } else { // Live in tail block, must also be live in predecessors. 
- for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) { - MachineBasicBlock *SrcBB = TDBBs[j]; + for (MachineBasicBlock *SrcBB : TDBBs) { if (Idx != 0) { MI.getOperand(Idx).setReg(Reg); MI.getOperand(Idx + 1).setMBB(SrcBB); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 5119dac36713..3f22cc4289f2 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -436,7 +436,7 @@ MachineInstr &TargetInstrInfo::duplicate(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, const MachineInstr &Orig) const { assert(!Orig.isNotDuplicable() && "Instruction cannot be duplicated"); MachineFunction &MF = *MBB.getParent(); - return MF.CloneMachineInstrBundle(MBB, InsertBefore, Orig); + return MF.cloneMachineInstrBundle(MBB, InsertBefore, Orig); } // If the COPY instruction in MI can be folded to a stack operation, return @@ -1418,3 +1418,16 @@ void TargetInstrInfo::mergeOutliningCandidateAttributes( })) F.addFnAttr(Attribute::NoUnwind); } + +bool TargetInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, + unsigned &Flags) const { + // Some instrumentations create special TargetOpcode at the start which + // expands to special code sequences which must be present. + auto First = MBB.getFirstNonDebugInstr(); + if (First != MBB.end() && + (First->getOpcode() == TargetOpcode::FENTRY_CALL || + First->getOpcode() == TargetOpcode::PATCHABLE_FUNCTION_ENTER)) + return false; + + return true; +} diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index c0a7efff9e98..6fc6881f8736 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1187,7 +1187,7 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI, // all stack slots), but we need to handle the different type of stackmap // operands and memory effects here. 
- if (!llvm::any_of(MI->operands(), + if (llvm::none_of(MI->operands(), [](MachineOperand &Operand) { return Operand.isFI(); })) return MBB; diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp index f4bb71535f7f..f5cb518fce3e 100644 --- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp +++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp @@ -248,8 +248,8 @@ static void getAllocatableSetForRC(const MachineFunction &MF, const TargetRegisterClass *RC, BitVector &R){ assert(RC->isAllocatable() && "invalid for nonallocatable sets"); ArrayRef<MCPhysReg> Order = RC->getRawAllocationOrder(MF); - for (unsigned i = 0; i != Order.size(); ++i) - R.set(Order[i]); + for (MCPhysReg PR : Order) + R.set(PR); } BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF, diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index c9a19948ff2f..3426a03b6083 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -144,23 +144,22 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { } // Actually remove the blocks now. - for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) { + for (MachineBasicBlock *BB : DeadBlocks) { // Remove any call site information for calls in the block. - for (auto &I : DeadBlocks[i]->instrs()) + for (auto &I : BB->instrs()) if (I.shouldUpdateCallSiteInfo()) - DeadBlocks[i]->getParent()->eraseCallSiteInfo(&I); + BB->getParent()->eraseCallSiteInfo(&I); - DeadBlocks[i]->eraseFromParent(); + BB->eraseFromParent(); } // Cleanup PHI nodes. - for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) { - MachineBasicBlock *BB = &*I; + for (MachineBasicBlock &BB : F) { // Prune unneeded PHI entries. 
- SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(), - BB->pred_end()); - MachineBasicBlock::iterator phi = BB->begin(); - while (phi != BB->end() && phi->isPHI()) { + SmallPtrSet<MachineBasicBlock*, 8> preds(BB.pred_begin(), + BB.pred_end()); + MachineBasicBlock::iterator phi = BB.begin(); + while (phi != BB.end() && phi->isPHI()) { for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2) if (!preds.count(phi->getOperand(i).getMBB())) { phi->RemoveOperand(i); @@ -189,7 +188,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { // insert a COPY instead of simply replacing the output // with the input. const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); - BuildMI(*BB, BB->getFirstNonPHI(), phi->getDebugLoc(), + BuildMI(BB, BB.getFirstNonPHI(), phi->getDebugLoc(), TII->get(TargetOpcode::COPY), OutputReg) .addReg(InputReg, getRegState(Input), InputSub); } diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp new file mode 100644 index 000000000000..cbc5d9ec169b --- /dev/null +++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp @@ -0,0 +1,1009 @@ +//===- VLIWMachineScheduler.cpp - VLIW-Focused Scheduling Pass ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// MachineScheduler schedules machine instructions after phi elimination. It +// preserves LiveIntervals so it can be invoked before register allocation. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/VLIWMachineScheduler.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleHazardRecognizer.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iomanip> +#include <limits> +#include <memory> +#include <sstream> + +using namespace llvm; + +#define DEBUG_TYPE "machine-scheduler" + +static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", cl::Hidden, + cl::ZeroOrMore, cl::init(false)); + +static cl::opt<bool> UseNewerCandidate("use-newer-candidate", cl::Hidden, + cl::ZeroOrMore, cl::init(true)); + +static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level", + cl::Hidden, cl::ZeroOrMore, + cl::init(1)); + +// Check if the scheduler should penalize instructions that are available too +// early due to a zero-latency dependence. +static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, + cl::ZeroOrMore, cl::init(true)); + +// This value is used to determine if a register class is a high pressure set. +// We compute the maximum number of registers needed and divide it by the total +// available. Then, we compare the result to this value. 
+static cl::opt<float> RPThreshold("vliw-misched-reg-pressure", cl::Hidden, + cl::init(0.75f), + cl::desc("High register pressure threhold.")); + +VLIWResourceModel::VLIWResourceModel(const TargetSubtargetInfo &STI, + const TargetSchedModel *SM) + : TII(STI.getInstrInfo()), SchedModel(SM) { + ResourcesModel = createPacketizer(STI); + + // This hard requirement could be relaxed, + // but for now do not let it proceed. + assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); + + Packet.reserve(SchedModel->getIssueWidth()); + Packet.clear(); + ResourcesModel->clearResources(); +} + +void VLIWResourceModel::reset() { + Packet.clear(); + ResourcesModel->clearResources(); +} + +VLIWResourceModel::~VLIWResourceModel() { delete ResourcesModel; } + +/// Return true if there is a dependence between SUd and SUu. +bool VLIWResourceModel::hasDependence(const SUnit *SUd, const SUnit *SUu) { + if (SUd->Succs.size() == 0) + return false; + + for (const auto &S : SUd->Succs) { + // Since we do not add pseudos to packets, might as well + // ignore order dependencies. + if (S.isCtrl()) + continue; + + if (S.getSUnit() == SUu && S.getLatency() > 0) + return true; + } + return false; +} + +/// Check if scheduling of this SU is possible +/// in the current packet. +/// It is _not_ precise (stateful), it is more like +/// another heuristic. Many corner cases are figured +/// empirically. +bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) { + if (!SU || !SU->getInstr()) + return false; + + // First see if the pipeline could receive this instruction + // in the current cycle. 
+ switch (SU->getInstr()->getOpcode()) { + default: + if (!ResourcesModel->canReserveResources(*SU->getInstr())) + return false; + break; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: + break; + } + + // Now see if there are no other dependencies to instructions already + // in the packet. + if (IsTop) { + for (unsigned i = 0, e = Packet.size(); i != e; ++i) + if (hasDependence(Packet[i], SU)) + return false; + } else { + for (unsigned i = 0, e = Packet.size(); i != e; ++i) + if (hasDependence(SU, Packet[i])) + return false; + } + return true; +} + +/// Keep track of available resources. +bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) { + bool startNewCycle = false; + // Artificially reset state. + if (!SU) { + reset(); + TotalPackets++; + return false; + } + // If this SU does not fit in the packet or the packet is now full + // start a new one. 
+ if (!isResourceAvailable(SU, IsTop) || + Packet.size() >= SchedModel->getIssueWidth()) { + reset(); + TotalPackets++; + startNewCycle = true; + } + + switch (SU->getInstr()->getOpcode()) { + default: + ResourcesModel->reserveResources(*SU->getInstr()); + break; + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::CFI_INSTRUCTION: + case TargetOpcode::EH_LABEL: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: + break; + } + Packet.push_back(SU); + +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); + for (unsigned i = 0, e = Packet.size(); i != e; ++i) { + LLVM_DEBUG(dbgs() << "\t[" << i << "] SU("); + LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); + LLVM_DEBUG(Packet[i]->getInstr()->dump()); + } +#endif + + return startNewCycle; +} + +DFAPacketizer * +VLIWResourceModel::createPacketizer(const TargetSubtargetInfo &STI) const { + return STI.getInstrInfo()->CreateTargetScheduleState(STI); +} + +/// schedule - Called back from MachineScheduler::runOnMachineFunction +/// after setting up the current scheduling region. [RegionBegin, RegionEnd) +/// only includes instructions that have DAG nodes, not scheduling boundaries. +void VLIWMachineScheduler::schedule() { + LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW " + << printMBBReference(*BB) << " " << BB->getName() + << " in_func " << BB->getParent()->getName() + << " at loop depth " << MLI->getLoopDepth(BB) << " \n"); + + buildDAGWithRegPressure(); + + Topo.InitDAGTopologicalSorting(); + + // Postprocess the DAG to add platform-specific artificial dependencies. + postprocessDAG(); + + SmallVector<SUnit *, 8> TopRoots, BotRoots; + findRootsAndBiasEdges(TopRoots, BotRoots); + + // Initialize the strategy before modifying the DAG. 
+ SchedImpl->initialize(this); + + LLVM_DEBUG({ + unsigned maxH = 0; + for (const SUnit &SU : SUnits) + if (SU.getHeight() > maxH) + maxH = SU.getHeight(); + dbgs() << "Max Height " << maxH << "\n"; + }); + LLVM_DEBUG({ + unsigned maxD = 0; + for (const SUnit &SU : SUnits) + if (SU.getDepth() > maxD) + maxD = SU.getDepth(); + dbgs() << "Max Depth " << maxD << "\n"; + }); + LLVM_DEBUG(dump()); + if (ViewMISchedDAGs) + viewGraph(); + + initQueues(TopRoots, BotRoots); + + bool IsTopNode = false; + while (true) { + LLVM_DEBUG( + dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) + break; + + if (!checkSchedLimit()) + break; + + scheduleMI(SU, IsTopNode); + + // Notify the scheduling strategy after updating the DAG. + SchedImpl->schedNode(SU, IsTopNode); + + updateQueues(SU, IsTopNode); + } + assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); + + placeDebugValues(); + + LLVM_DEBUG({ + dbgs() << "*** Final schedule for " + << printMBBReference(*begin()->getParent()) << " ***\n"; + dumpSchedule(); + dbgs() << '\n'; + }); +} + +void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { + DAG = static_cast<VLIWMachineScheduler *>(dag); + SchedModel = DAG->getSchedModel(); + + Top.init(DAG, SchedModel); + Bot.init(DAG, SchedModel); + + // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or + // are disabled, then these HazardRecs will be disabled. 
+ const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); + const TargetSubtargetInfo &STI = DAG->MF.getSubtarget(); + const TargetInstrInfo *TII = STI.getInstrInfo(); + delete Top.HazardRec; + delete Bot.HazardRec; + Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); + Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); + + delete Top.ResourceModel; + delete Bot.ResourceModel; + Top.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel()); + Bot.ResourceModel = createVLIWResourceModel(STI, DAG->getSchedModel()); + + const std::vector<unsigned> &MaxPressure = + DAG->getRegPressure().MaxSetPressure; + HighPressureSets.assign(MaxPressure.size(), 0); + for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) { + unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i); + HighPressureSets[i] = + ((float)MaxPressure[i] > ((float)Limit * RPThreshold)); + } + + assert((!ForceTopDown || !ForceBottomUp) && + "-misched-topdown incompatible with -misched-bottomup"); +} + +VLIWResourceModel *ConvergingVLIWScheduler::createVLIWResourceModel( + const TargetSubtargetInfo &STI, const TargetSchedModel *SchedModel) const { + return new VLIWResourceModel(STI, SchedModel); +} + +void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { + for (const SDep &PI : SU->Preds) { + unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle; + unsigned MinLatency = PI.getLatency(); +#ifndef NDEBUG + Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); +#endif + if (SU->TopReadyCycle < PredReadyCycle + MinLatency) + SU->TopReadyCycle = PredReadyCycle + MinLatency; + } + + if (!SU->isScheduled) + Top.releaseNode(SU, SU->TopReadyCycle); +} + +void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { + assert(SU->getInstr() && "Scheduled SUnit must have instr"); + + for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); I != E; + ++I) { + unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; + unsigned 
MinLatency = I->getLatency(); +#ifndef NDEBUG + Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); +#endif + if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) + SU->BotReadyCycle = SuccReadyCycle + MinLatency; + } + + if (!SU->isScheduled) + Bot.releaseNode(SU, SU->BotReadyCycle); +} + +ConvergingVLIWScheduler::VLIWSchedBoundary::~VLIWSchedBoundary() { + delete ResourceModel; + delete HazardRec; +} + +/// Does this SU have a hazard within the current instruction group. +/// +/// The scheduler supports two modes of hazard recognition. The first is the +/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that +/// supports highly complicated in-order reservation tables +/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. +/// +/// The second is a streamlined mechanism that checks for hazards based on +/// simple counters that the scheduler itself maintains. It explicitly checks +/// for instruction dispatch limitations, including the number of micro-ops that +/// can dispatch per cycle. +/// +/// TODO: Also check whether the SU must start a new group. +bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) { + if (HazardRec->isEnabled()) + return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; + + unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); + if (IssueCount + uops > SchedModel->getIssueWidth()) + return true; + + return false; +} + +void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode( + SUnit *SU, unsigned ReadyCycle) { + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + // Check for interlocks first. For the purpose of other heuristics, an + // instruction that cannot issue appears as if it's not in the ReadyQueue. + if (ReadyCycle > CurrCycle || checkHazard(SU)) + + Pending.push(SU); + else + Available.push(SU); +} + +/// Move the boundary of scheduled code by one cycle. 
+void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() { + unsigned Width = SchedModel->getIssueWidth(); + IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width; + + assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && + "MinReadyCycle uninitialized"); + unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); + + if (!HazardRec->isEnabled()) { + // Bypass HazardRec virtual calls. + CurrCycle = NextCycle; + } else { + // Bypass getHazardType calls in case of long latency. + for (; CurrCycle != NextCycle; ++CurrCycle) { + if (isTop()) + HazardRec->AdvanceCycle(); + else + HazardRec->RecedeCycle(); + } + } + CheckPending = true; + + LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle " + << CurrCycle << '\n'); +} + +/// Move the boundary of scheduled code by one SUnit. +void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) { + bool startNewCycle = false; + + // Update the reservation table. + if (HazardRec->isEnabled()) { + if (!isTop() && SU->isCall) { + // Calls are scheduled with their preceding instructions. For bottom-up + // scheduling, clear the pipeline state before emitting. + HazardRec->Reset(); + } + HazardRec->EmitInstruction(SU); + } + + // Update DFA model. + startNewCycle = ResourceModel->reserveResources(SU, isTop()); + + // Check the instruction group dispatch limit. + // TODO: Check if this SU must end a dispatch group. + IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); + if (startNewCycle) { + LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); + bumpCycle(); + } else + LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle " + << CurrCycle << '\n'); +} + +/// Release pending ready nodes in to the available queue. This makes them +/// visible to heuristics. +void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() { + // If the available queue is empty, it is safe to reset MinReadyCycle. 
+ if (Available.empty()) + MinReadyCycle = std::numeric_limits<unsigned>::max(); + + // Check to see if any of the pending instructions are ready to issue. If + // so, add them to the available queue. + for (unsigned i = 0, e = Pending.size(); i != e; ++i) { + SUnit *SU = *(Pending.begin() + i); + unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; + + if (ReadyCycle < MinReadyCycle) + MinReadyCycle = ReadyCycle; + + if (ReadyCycle > CurrCycle) + continue; + + if (checkHazard(SU)) + continue; + + Available.push(SU); + Pending.remove(Pending.begin() + i); + --i; + --e; + } + CheckPending = false; +} + +/// Remove SU from the ready set for this boundary. +void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) { + if (Available.isInQueue(SU)) + Available.remove(Available.find(SU)); + else { + assert(Pending.isInQueue(SU) && "bad ready count"); + Pending.remove(Pending.find(SU)); + } +} + +/// If this queue only has one ready candidate, return it. As a side effect, +/// advance the cycle until at least one node is ready. If multiple instructions +/// are ready, return NULL. 
+SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() { + if (CheckPending) + releasePending(); + + auto AdvanceCycle = [this]() { + if (Available.empty()) + return true; + if (Available.size() == 1 && Pending.size() > 0) + return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) || + getWeakLeft(*Available.begin(), isTop()) != 0; + return false; + }; + for (unsigned i = 0; AdvanceCycle(); ++i) { + assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && + "permanent hazard"); + (void)i; + ResourceModel->reserveResources(nullptr, isTop()); + bumpCycle(); + releasePending(); + } + if (Available.size() == 1) + return *Available.begin(); + return nullptr; +} + +#ifndef NDEBUG +void ConvergingVLIWScheduler::traceCandidate(const char *Label, + const ReadyQueue &Q, SUnit *SU, + int Cost, PressureChange P) { + dbgs() << Label << " " << Q.getName() << " "; + if (P.isValid()) + dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" + << P.getUnitInc() << " "; + else + dbgs() << " "; + dbgs() << "cost(" << Cost << ")\t"; + DAG->dumpNode(*SU); +} + +// Very detailed queue dump, to be used with higher verbosity levels. 
+void ConvergingVLIWScheduler::readyQueueVerboseDump( + const RegPressureTracker &RPTracker, SchedCandidate &Candidate, + ReadyQueue &Q) { + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + + dbgs() << ">>> " << Q.getName() << "\n"; + for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + RegPressureDelta RPDelta; + TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + std::stringstream dbgstr; + dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")"; + dbgs() << dbgstr.str(); + SchedulingCost(Q, *I, Candidate, RPDelta, true); + dbgs() << "\t"; + (*I)->getInstr()->dump(); + } + dbgs() << "\n"; +} +#endif + +/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor +/// of SU, return true (we may have duplicates) +static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) { + if (SU->NumPredsLeft == 0) + return false; + + for (auto &Pred : SU->Preds) { + // We found an available, but not scheduled, predecessor. + if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2)) + return false; + } + + return true; +} + +/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor +/// of SU, return true (we may have duplicates) +static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) { + if (SU->NumSuccsLeft == 0) + return false; + + for (auto &Succ : SU->Succs) { + // We found an available, but not scheduled, successor. + if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2)) + return false; + } + return true; +} + +/// Check if the instruction changes the register pressure of a register in the +/// high pressure set. The function returns a negative value if the pressure +/// decreases and a positive value is the pressure increases. If the instruction +/// doesn't use a high pressure register or doesn't change the register +/// pressure, then return 0. 
+int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) { + PressureDiff &PD = DAG->getPressureDiff(SU); + for (auto &P : PD) { + if (!P.isValid()) + continue; + // The pressure differences are computed bottom-up, so the comparison for + // an increase is positive in the bottom direction, but negative in the + // top-down direction. + if (HighPressureSets[P.getPSet()]) + return (isBotUp ? P.getUnitInc() : -P.getUnitInc()); + } + return 0; +} + +/// Single point to compute overall scheduling cost. +/// TODO: More heuristics will be used soon. +int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, + SchedCandidate &Candidate, + RegPressureDelta &Delta, + bool verbose) { + // Initial trivial priority. + int ResCount = 1; + + // Do not waste time on a node that is already scheduled. + if (!SU || SU->isScheduled) + return ResCount; + + LLVM_DEBUG(if (verbose) dbgs() + << ((Q.getID() == TopQID) ? "(top|" : "(bot|")); + // Forced priority is high. + if (SU->isScheduleHigh) { + ResCount += PriorityOne; + LLVM_DEBUG(dbgs() << "H|"); + } + + unsigned IsAvailableAmt = 0; + // Critical path first. + if (Q.getID() == TopQID) { + if (Top.isLatencyBound(SU)) { + LLVM_DEBUG(if (verbose) dbgs() << "LB|"); + ResCount += (SU->getHeight() * ScaleTwo); + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "h" << std::setw(3) << SU->getHeight() << "|"; + dbgs() << dbgstr.str(); + }); + + // If resources are available for it, multiply the + // chance of scheduling. 
+ if (Top.ResourceModel->isResourceAvailable(SU, true)) { + IsAvailableAmt = (PriorityTwo + PriorityThree); + ResCount += IsAvailableAmt; + LLVM_DEBUG(if (verbose) dbgs() << "A|"); + } else + LLVM_DEBUG(if (verbose) dbgs() << " |"); + } else { + if (Bot.isLatencyBound(SU)) { + LLVM_DEBUG(if (verbose) dbgs() << "LB|"); + ResCount += (SU->getDepth() * ScaleTwo); + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "d" << std::setw(3) << SU->getDepth() << "|"; + dbgs() << dbgstr.str(); + }); + + // If resources are available for it, multiply the + // chance of scheduling. + if (Bot.ResourceModel->isResourceAvailable(SU, false)) { + IsAvailableAmt = (PriorityTwo + PriorityThree); + ResCount += IsAvailableAmt; + LLVM_DEBUG(if (verbose) dbgs() << "A|"); + } else + LLVM_DEBUG(if (verbose) dbgs() << " |"); + } + + unsigned NumNodesBlocking = 0; + if (Q.getID() == TopQID) { + // How many SUs does it block from scheduling? + // Look at all of the successors of this node. + // Count the number of nodes that + // this node is the sole unscheduled node for. + if (Top.isLatencyBound(SU)) + for (const SDep &SI : SU->Succs) + if (isSingleUnscheduledPred(SI.getSUnit(), SU)) + ++NumNodesBlocking; + } else { + // How many unscheduled predecessors block this node? + if (Bot.isLatencyBound(SU)) + for (const SDep &PI : SU->Preds) + if (isSingleUnscheduledSucc(PI.getSUnit(), SU)) + ++NumNodesBlocking; + } + ResCount += (NumNodesBlocking * ScaleTwo); + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|"; + dbgs() << dbgstr.str(); + }); + + // Factor in reg pressure as a heuristic. + if (!IgnoreBBRegPressure) { + // Decrease priority by the amount that register pressure exceeds the limit. + ResCount -= (Delta.Excess.getUnitInc() * PriorityOne); + // Decrease priority if register pressure exceeds the limit. 
+ ResCount -= (Delta.CriticalMax.getUnitInc() * PriorityOne); + // Decrease priority slightly if register pressure would increase over the + // current maximum. + ResCount -= (Delta.CurrentMax.getUnitInc() * PriorityTwo); + // If there are register pressure issues, then we remove the value added for + // the instruction being available. The rationale is that we really don't + // want to schedule an instruction that causes a spill. + if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 && + (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() || + Delta.CurrentMax.getUnitInc())) + ResCount -= IsAvailableAmt; + LLVM_DEBUG(if (verbose) { + dbgs() << "RP " << Delta.Excess.getUnitInc() << "/" + << Delta.CriticalMax.getUnitInc() << "/" + << Delta.CurrentMax.getUnitInc() << ")|"; + }); + } + + // Give preference to a zero latency instruction if the dependent + // instruction is in the current packet. + if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) { + for (const SDep &PI : SU->Preds) { + if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() && + PI.getLatency() == 0 && + Top.ResourceModel->isInPacket(PI.getSUnit())) { + ResCount += PriorityThree; + LLVM_DEBUG(if (verbose) dbgs() << "Z|"); + } + } + } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) { + for (const SDep &SI : SU->Succs) { + if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() && + SI.getLatency() == 0 && + Bot.ResourceModel->isInPacket(SI.getSUnit())) { + ResCount += PriorityThree; + LLVM_DEBUG(if (verbose) dbgs() << "Z|"); + } + } + } + + // If the instruction has a non-zero latency dependence with an instruction in + // the current packet, then it should not be scheduled yet. The case occurs + // when the dependent instruction is scheduled in a new packet, so the + // scheduler updates the current cycle and pending instructions become + // available. 
+ if (CheckEarlyAvail) { + if (Q.getID() == TopQID) { + for (const auto &PI : SU->Preds) { + if (PI.getLatency() > 0 && + Top.ResourceModel->isInPacket(PI.getSUnit())) { + ResCount -= PriorityOne; + LLVM_DEBUG(if (verbose) dbgs() << "D|"); + } + } + } else { + for (const auto &SI : SU->Succs) { + if (SI.getLatency() > 0 && + Bot.ResourceModel->isInPacket(SI.getSUnit())) { + ResCount -= PriorityOne; + LLVM_DEBUG(if (verbose) dbgs() << "D|"); + } + } + } + } + + LLVM_DEBUG(if (verbose) { + std::stringstream dbgstr; + dbgstr << "Total " << std::setw(4) << ResCount << ")"; + dbgs() << dbgstr.str(); + }); + + return ResCount; +} + +/// Pick the best candidate from the top queue. +/// +/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during +/// DAG building. To adjust for the current scheduling location we need to +/// maintain the number of vreg uses remaining to be top-scheduled. +ConvergingVLIWScheduler::CandResult +ConvergingVLIWScheduler::pickNodeFromQueue(VLIWSchedBoundary &Zone, + const RegPressureTracker &RPTracker, + SchedCandidate &Candidate) { + ReadyQueue &Q = Zone.Available; + LLVM_DEBUG(if (SchedDebugVerboseLevel > 1) + readyQueueVerboseDump(RPTracker, Candidate, Q); + else Q.dump();); + + // getMaxPressureDelta temporarily modifies the tracker. + RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); + + // BestSU remains NULL if no top candidates beat the best existing candidate. + CandResult FoundCandidate = NoCand; + for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { + RegPressureDelta RPDelta; + TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, + DAG->getRegionCriticalPSets(), + DAG->getRegPressure().MaxSetPressure); + + int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); + + // Initialize the candidate if needed. 
+ if (!Candidate.SU) { + LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + continue; + } + + // Choose node order for negative cost candidates. There is no good + // candidate in this case. + if (CurrentCost < 0 && Candidate.SCost < 0) { + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) || + (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + } + continue; + } + + // Best cost. + if (CurrentCost > Candidate.SCost) { + LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = BestCost; + continue; + } + + // Choose an instruction that does not depend on an artificial edge. + unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID)); + unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID)); + if (CurrWeak != CandWeak) { + if (CurrWeak < CandWeak) { + LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = Weak; + } + continue; + } + + if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) { + unsigned CurrSize, CandSize; + if (Q.getID() == TopQID) { + CurrSize = (*I)->Succs.size(); + CandSize = Candidate.SU->Succs.size(); + } else { + CurrSize = (*I)->Preds.size(); + CandSize = Candidate.SU->Preds.size(); + } + if (CurrSize > CandSize) { + LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = BestCost; + } + // Keep the old candidate if it's a better candidate. 
That is, don't use + // the subsequent tie breaker. + if (CurrSize != CandSize) + continue; + } + + // Tie breaker. + // To avoid scheduling indeterminism, we need a tie breaker + // for the case when cost is identical for two nodes. + if (UseNewerCandidate && CurrentCost == Candidate.SCost) { + if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) || + (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { + LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost)); + Candidate.SU = *I; + Candidate.RPDelta = RPDelta; + Candidate.SCost = CurrentCost; + FoundCandidate = NodeOrder; + continue; + } + } + + // Fall through to original instruction order. + // Only consider node order if Candidate was chosen from this Q. + if (FoundCandidate == NoCand) + continue; + } + return FoundCandidate; +} + +/// Pick the best candidate node from either the top or bottom queue. +SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { + // Schedule as far as possible in the direction of no choice. This is most + // efficient, but also provides the best heuristics for CriticalPSets. + if (SUnit *SU = Bot.pickOnlyChoice()) { + LLVM_DEBUG(dbgs() << "Picked only Bottom\n"); + IsTopNode = false; + return SU; + } + if (SUnit *SU = Top.pickOnlyChoice()) { + LLVM_DEBUG(dbgs() << "Picked only Top\n"); + IsTopNode = true; + return SU; + } + SchedCandidate BotCand; + // Prefer bottom scheduling when heuristics are silent. + CandResult BotResult = + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + + // If either Q has a single candidate that provides the least increase in + // Excess pressure, we can immediately schedule from that Q. + // + // RegionCriticalPSets summarizes the pressure within the scheduled region and + // affects picking from either Q. 
If scheduling in one direction must + // increase pressure for one of the excess PSets, then schedule in that + // direction first to provide more freedom in the other direction. + if (BotResult == SingleExcess || BotResult == SingleCritical) { + LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n"); + IsTopNode = false; + return BotCand.SU; + } + // Check if the top Q has a better candidate. + SchedCandidate TopCand; + CandResult TopResult = + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + + if (TopResult == SingleExcess || TopResult == SingleCritical) { + LLVM_DEBUG(dbgs() << "Prefered Top Node\n"); + IsTopNode = true; + return TopCand.SU; + } + // If either Q has a single candidate that minimizes pressure above the + // original region's pressure pick it. + if (BotResult == SingleMax) { + LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n"); + IsTopNode = false; + return BotCand.SU; + } + if (TopResult == SingleMax) { + LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n"); + IsTopNode = true; + return TopCand.SU; + } + if (TopCand.SCost > BotCand.SCost) { + LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n"); + IsTopNode = true; + return TopCand.SU; + } + // Otherwise prefer the bottom candidate in node order. + LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n"); + IsTopNode = false; + return BotCand.SU; +} + +/// Pick the best node to balance the schedule. Implements MachineSchedStrategy. 
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { + if (DAG->top() == DAG->bottom()) { + assert(Top.Available.empty() && Top.Pending.empty() && + Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); + return nullptr; + } + SUnit *SU; + if (ForceTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + SchedCandidate TopCand; + CandResult TopResult = + pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); + assert(TopResult != NoCand && "failed to find the first candidate"); + (void)TopResult; + SU = TopCand.SU; + } + IsTopNode = true; + } else if (ForceBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + SchedCandidate BotCand; + CandResult BotResult = + pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); + assert(BotResult != NoCand && "failed to find the first candidate"); + (void)BotResult; + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidrectional(IsTopNode); + } + if (SU->isTopReady()) + Top.removeReady(SU); + if (SU->isBottomReady()) + Bot.removeReady(SU); + + LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") + << " Scheduling instruction in cycle " + << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " (" + << reportPackets() << ")\n"; + DAG->dumpNode(*SU)); + return SU; +} + +/// Update the scheduler's state after scheduling a node. This is the same node +/// that was just returned by pickNode(). However, VLIWMachineScheduler needs +/// to update it's state based on the current cycle before MachineSchedStrategy +/// does. 
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { + if (IsTopNode) { + Top.bumpNode(SU); + SU->TopReadyCycle = Top.CurrCycle; + } else { + Bot.bumpNode(SU); + SU->BotReadyCycle = Bot.CurrCycle; + } +} diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 4876b9e23717..0c42bef82005 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -201,9 +201,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const { case MVT::x86amx: return Type::getX86_AMXTy(Context); case MVT::i64x8: return IntegerType::get(Context, 512); case MVT::externref: + // pointer to opaque struct in addrspace(10) return PointerType::get(StructType::create(Context), 10); case MVT::funcref: - return PointerType::get(StructType::create(Context), 20); + // pointer to i8 addrspace(20) + return PointerType::get(Type::getInt8Ty(Context), 20); case MVT::v1i1: return FixedVectorType::get(Type::getInt1Ty(Context), 1); case MVT::v2i1: diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index 4564aa1c1278..d31183e46d65 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -573,9 +573,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, const auto *CatchSwitch = cast<CatchSwitchInst>(Pad); int CatchState = -1, FollowerState = -1; SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers()); - for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend(); - CBI != CBE; ++CBI, FollowerState = CatchState) { - const BasicBlock *CatchBlock = *CBI; + for (const BasicBlock *CatchBlock : llvm::reverse(CatchBlocks)) { // Create the entry for this catch with the appropriate handler // properties. const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI()); @@ -591,6 +589,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, Worklist.emplace_back(I, CatchState); // Remember this catch's state. 
FuncInfo.EHPadStateMap[Catch] = CatchState; + FollowerState = CatchState; } // Associate the catchswitch with the state of its first catch. assert(CatchSwitch->getNumHandlers()); @@ -601,11 +600,9 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Step two: record the TryParentState of each state. For cleanuppads that // don't have cleanuprets, we may need to infer this from their child pads, // so visit pads in descendant-most to ancestor-most order. - for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(), - End = FuncInfo.ClrEHUnwindMap.rend(); - Entry != End; ++Entry) { + for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) { const Instruction *Pad = - Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI(); + Entry.Handler.get<const BasicBlock *>()->getFirstNonPHI(); // For most pads, the TryParentState is the state associated with the // unwind dest of exceptional exits from it. const BasicBlock *UnwindDest; @@ -615,7 +612,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // that's not the unwind dest of exceptions escaping the catch. Those // cases were already assigned a TryParentState in the first pass, so // skip them. - if (Entry->TryParentState != -1) + if (Entry.TryParentState != -1) continue; // Otherwise, get the unwind dest from the catchswitch. UnwindDest = Catch->getCatchSwitch()->getUnwindDest(); @@ -692,7 +689,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()]; } - Entry->TryParentState = UnwindDestState; + Entry.TryParentState = UnwindDestState; } // Step three: transfer information from pads to invokes. 
diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index 11d1b309aa64..b66429d8a5bf 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -226,6 +226,7 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { case Triple::ArchType::arm: case Triple::ArchType::thumb: case Triple::ArchType::aarch64: + case Triple::ArchType::hexagon: case Triple::ArchType::mips: case Triple::ArchType::mipsel: case Triple::ArchType::mips64: diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index a3dec6c25e44..ae0859e1ecfd 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -223,22 +223,21 @@ static void analyzeImportedModule( SysRoot = CU.getSysRoot(); if (!SysRoot.empty() && Path.startswith(SysRoot)) return; - if (Optional<DWARFFormValue> Val = DIE.find(dwarf::DW_AT_name)) - if (Optional<const char *> Name = Val->getAsCString()) { - auto &Entry = (*ParseableSwiftInterfaces)[*Name]; - // The prepend path is applied later when copying. - DWARFDie CUDie = CU.getOrigUnit().getUnitDIE(); - SmallString<128> ResolvedPath; - if (sys::path::is_relative(Path)) - resolveRelativeObjectPath(ResolvedPath, CUDie); - sys::path::append(ResolvedPath, Path); - if (!Entry.empty() && Entry != ResolvedPath) - ReportWarning( - Twine("Conflicting parseable interfaces for Swift Module ") + - *Name + ": " + Entry + " and " + Path, - DIE); - Entry = std::string(ResolvedPath.str()); - } + Optional<const char*> Name = dwarf::toString(DIE.find(dwarf::DW_AT_name)); + if (!Name) + return; + auto &Entry = (*ParseableSwiftInterfaces)[*Name]; + // The prepend path is applied later when copying. 
+ DWARFDie CUDie = CU.getOrigUnit().getUnitDIE(); + SmallString<128> ResolvedPath; + if (sys::path::is_relative(Path)) + resolveRelativeObjectPath(ResolvedPath, CUDie); + sys::path::append(ResolvedPath, Path); + if (!Entry.empty() && Entry != ResolvedPath) + ReportWarning(Twine("Conflicting parseable interfaces for Swift Module ") + + *Name + ": " + Entry + " and " + Path, + DIE); + Entry = std::string(ResolvedPath.str()); } /// The distinct types of work performed by the work loop in @@ -409,10 +408,10 @@ static bool dieNeedsChildrenToBeMeaningful(uint32_t Tag) { void DWARFLinker::cleanupAuxiliarryData(LinkContext &Context) { Context.clear(); - for (auto I = DIEBlocks.begin(), E = DIEBlocks.end(); I != E; ++I) - (*I)->~DIEBlock(); - for (auto I = DIELocs.begin(), E = DIELocs.end(); I != E; ++I) - (*I)->~DIELoc(); + for (DIEBlock *I : DIEBlocks) + I->~DIEBlock(); + for (DIELoc *I : DIELocs) + I->~DIELoc(); DIEBlocks.clear(); DIELocs.clear(); @@ -846,7 +845,7 @@ void DWARFLinker::assignAbbrev(DIEAbbrev &Abbrev) { unsigned DWARFLinker::DIECloner::cloneStringAttribute( DIE &Die, AttributeSpec AttrSpec, const DWARFFormValue &Val, const DWARFUnit &U, OffsetsStringPool &StringPool, AttributesInfo &Info) { - Optional<const char *> String = Val.getAsCString(); + Optional<const char *> String = dwarf::toString(Val); if (!String) return 0; @@ -1423,6 +1422,11 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE, Flags |= TF_InFunctionScope; if (!Info.InDebugMap && LLVM_LIKELY(!Update)) Flags |= TF_SkipPC; + } else if (Abbrev->getTag() == dwarf::DW_TAG_variable) { + // Function-local globals could be in the debug map even when the function + // is not, e.g., inlined functions. 
+ if ((Flags & TF_InFunctionScope) && Info.InDebugMap) + Flags &= ~TF_SkipPC; } for (const auto &AttrSpec : Abbrev->attributes()) { diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp index 46e7457f2368..1ab6ead3b5f6 100644 --- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp @@ -531,9 +531,7 @@ void DwarfStreamer::emitLineTableForUnit(MCDwarfLineTableParams Params, unsigned RowsSinceLastSequence = 0; - for (unsigned Idx = 0; Idx < Rows.size(); ++Idx) { - auto &Row = Rows[Idx]; - + for (DWARFDebugLine::Row &Row : Rows) { int64_t AddressDelta; if (Address == -1ULL) { MS->emitIntValue(dwarf::DW_LNS_extended_op, 1); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index c8331487f282..95135c95e8d2 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1195,7 +1195,7 @@ void DWARFContext::addLocalsForDie(DWARFCompileUnit *CU, DWARFDie Subprogram, Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin)) Die = Origin; if (auto NameAttr = Die.find(DW_AT_name)) - if (Optional<const char *> Name = NameAttr->getAsCString()) + if (Optional<const char *> Name = dwarf::toString(*NameAttr)) Local.Name = *Name; if (auto Type = Die.getAttributeValueAsReferencedDie(DW_AT_type)) Local.Size = getTypeSize(Type, getCUAddrSize()); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index bda41b1f34e9..f36d3f87257a 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1331,8 +1331,8 @@ Optional<StringRef> DWARFDebugLine::LineTable::getSourceByIndex(uint64_t FileInd if (Kind == FileLineInfoKind::None || !Prologue.hasFileAtIndex(FileIndex)) return None; const FileNameEntry &Entry = Prologue.getFileNameEntry(FileIndex); - if (Optional<const char *> source = Entry.Source.getAsCString()) - return 
StringRef(*source); + if (auto E = dwarf::toString(Entry.Source)) + return StringRef(*E); return None; } @@ -1350,10 +1350,10 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( if (Kind == FileLineInfoKind::None || !hasFileAtIndex(FileIndex)) return false; const FileNameEntry &Entry = getFileNameEntry(FileIndex); - Optional<const char *> Name = Entry.Name.getAsCString(); - if (!Name) + auto E = dwarf::toString(Entry.Name); + if (!E) return false; - StringRef FileName = *Name; + StringRef FileName = *E; if (Kind == FileLineInfoKind::RawValue || isPathAbsoluteOnWindowsOrPosix(FileName)) { Result = std::string(FileName); @@ -1372,11 +1372,10 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( // relative names. if ((Entry.DirIdx != 0 || Kind != FileLineInfoKind::RelativeFilePath) && Entry.DirIdx < IncludeDirectories.size()) - IncludeDir = IncludeDirectories[Entry.DirIdx].getAsCString().getValue(); + IncludeDir = dwarf::toStringRef(IncludeDirectories[Entry.DirIdx]); } else { if (0 < Entry.DirIdx && Entry.DirIdx <= IncludeDirectories.size()) - IncludeDir = - IncludeDirectories[Entry.DirIdx - 1].getAsCString().getValue(); + IncludeDir = dwarf::toStringRef(IncludeDirectories[Entry.DirIdx - 1]); } // For absolute paths only, include the compilation directory of compile unit. 
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index cdffb36741c8..f39c7871d603 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -41,9 +41,7 @@ public: } // namespace static Error createResolverError(uint32_t Index, unsigned Kind) { - return createStringError(errc::invalid_argument, - "Unable to resolve indirect address %u for: %s", - Index, dwarf::LocListEncodingString(Kind).data()); + return make_error<ResolverError>(Index, (dwarf::LoclistEntries)Kind); } Expected<Optional<DWARFLocationExpression>> @@ -404,3 +402,10 @@ void DWARFDebugLoclists::dumpRange(uint64_t StartOffset, uint64_t Size, OS << '\n'; } } + +void llvm::ResolverError::log(raw_ostream &OS) const { + OS << format("unable to resolve indirect address %u for: %s", Index, + dwarf::LocListEncodingString(Kind).data()); +} + +char llvm::ResolverError::ID; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp index 80ffd81b3403..7a81d7ff064b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp @@ -194,13 +194,11 @@ Error DWARFDebugMacro::parseImpl( if (MacroContributionOffset == MacroToUnits.end()) return createStringError(errc::invalid_argument, "Macro contribution of the unit not found"); - Optional<uint64_t> StrOffset = + Expected<uint64_t> StrOffset = MacroContributionOffset->second->getStringOffsetSectionItem( Data.getULEB128(&Offset)); if (!StrOffset) - return createStringError( - errc::invalid_argument, - "String offsets contribution of the unit not found"); + return StrOffset.takeError(); E.MacroStr = MacroContributionOffset->second->getStringExtractor().getCStr( &*StrOffset); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index ed50f2635738..5421b2d59a1b 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ 
b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -89,7 +89,6 @@ static void dumpLocationList(raw_ostream &OS, const DWARFFormValue &FormValue, U->getLocationTable().dumpLocationList(&Offset, OS, U->getBaseAddress(), MRI, Ctx.getDWARFObj(), U, DumpOpts, Indent); - return; } static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, @@ -105,7 +104,6 @@ static void dumpLocationExpr(raw_ostream &OS, const DWARFFormValue &FormValue, Ctx.isLittleEndian(), 0); DWARFExpression(Data, U->getAddressByteSize(), U->getFormParams().Format) .print(OS, DumpOpts, MRI, U); - return; } static DWARFDie resolveReferencedType(DWARFDie D, @@ -672,6 +670,8 @@ struct DWARFTypePrinter { return; if (D.getTag() == DW_TAG_subprogram) return; + if (D.getTag() == DW_TAG_lexical_block) + return; D = D.resolveTypeUnitReference(); if (DWARFDie P = D.getParent()) appendScopes(P); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp index d0fbd702e831..e19f5b8138fa 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp @@ -217,8 +217,8 @@ static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS, if (DumpOpts.Verbose) OS << format("0x%08" PRIx64 " -> ", Operands[Operand]); OS << format("0x%08" PRIx64 ")", U->getOffset() + Operands[Operand]); - if (auto Name = Die.find(dwarf::DW_AT_name)) - OS << " \"" << Name->getAsCString() << "\""; + if (auto Name = dwarf::toString(Die.find(dwarf::DW_AT_name))) + OS << " \"" << *Name << "\""; } else { OS << format(" <invalid base_type ref: 0x%" PRIx64 ">", Operands[Operand]); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index cea0f63bbf81..86991a3949dd 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -613,50 +613,53 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const { } void 
DWARFFormValue::dumpString(raw_ostream &OS) const { - Optional<const char *> DbgStr = getAsCString(); - if (DbgStr.hasValue()) { + if (auto DbgStr = dwarf::toString(*this)) { auto COS = WithColor(OS, HighlightColor::String); COS.get() << '"'; - COS.get().write_escaped(DbgStr.getValue()); + COS.get().write_escaped(*DbgStr); COS.get() << '"'; } } -Optional<const char *> DWARFFormValue::getAsCString() const { +Expected<const char *> DWARFFormValue::getAsCString() const { if (!isFormClass(FC_String)) - return None; + return make_error<StringError>("Invalid form for string attribute", + inconvertibleErrorCode()); if (Form == DW_FORM_string) return Value.cstr; // FIXME: Add support for DW_FORM_GNU_strp_alt if (Form == DW_FORM_GNU_strp_alt || C == nullptr) - return None; + return make_error<StringError>("Unsupported form for string attribute", + inconvertibleErrorCode()); uint64_t Offset = Value.uval; - if (Form == DW_FORM_line_strp) { - // .debug_line_str is tracked in the Context. - if (const char *Str = C->getLineStringExtractor().getCStr(&Offset)) - return Str; - return None; - } + Optional<uint32_t> Index; if (Form == DW_FORM_GNU_str_index || Form == DW_FORM_strx || Form == DW_FORM_strx1 || Form == DW_FORM_strx2 || Form == DW_FORM_strx3 || Form == DW_FORM_strx4) { if (!U) - return None; - Optional<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset); + return make_error<StringError>("API limitation - string extraction not " + "available without a DWARFUnit", + inconvertibleErrorCode()); + Expected<uint64_t> StrOffset = U->getStringOffsetSectionItem(Offset); + Index = Offset; if (!StrOffset) - return None; + return StrOffset.takeError(); Offset = *StrOffset; } // Prefer the Unit's string extractor, because for .dwo it will point to // .debug_str.dwo, while the Context's extractor always uses .debug_str. 
- if (U) { - if (const char *Str = U->getStringExtractor().getCStr(&Offset)) - return Str; - return None; - } - if (const char *Str = C->getStringExtractor().getCStr(&Offset)) + DataExtractor StrData = Form == DW_FORM_line_strp + ? C->getLineStringExtractor() + : U ? U->getStringExtractor() + : C->getStringExtractor(); + if (const char *Str = StrData.getCStr(&Offset)) return Str; - return None; + std::string Msg = FormEncodingString(Form).str(); + if (Index) + Msg += (" uses index " + Twine(*Index) + ", but the referenced string").str(); + Msg += (" offset " + Twine(Offset) + " is beyond .debug_str bounds").str(); + return make_error<StringError>(Msg, + inconvertibleErrorCode()); } Optional<uint64_t> DWARFFormValue::getAsAddress() const { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index 82c34f537036..eed0a60ec75e 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -214,13 +214,17 @@ DWARFUnit::getAddrOffsetSectionItem(uint32_t Index) const { return {{Address, Section}}; } -Optional<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { +Expected<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const { if (!StringOffsetsTableContribution) - return None; + return make_error<StringError>( + "DW_FORM_strx used without a valid string offsets table", + inconvertibleErrorCode()); unsigned ItemSize = getDwarfStringOffsetsByteSize(); uint64_t Offset = getStringOffsetsBase() + Index * ItemSize; if (StringOffsetSection.Data.size() < Offset + ItemSize) - return None; + return make_error<StringError>("DW_FORM_strx uses index " + Twine(Index) + + ", which is too large", + inconvertibleErrorCode()); DWARFDataExtractor DA(Context.getDWARFObj(), StringOffsetSection, isLittleEndian, 0); return DA.getRelocatedValue(ItemSize, &Offset); @@ -603,7 +607,7 @@ bool DWARFUnit::parseDWO() { DWO->setAddrOffsetSection(AddrOffsetSection, *AddrOffsetSectionBase); if 
(getVersion() == 4) { auto DWORangesBase = UnitDie.getRangesBaseAttribute(); - DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); + DWO->setRangesSection(RangeSection, DWORangesBase.getValueOr(0)); } return true; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 7673a721c4ea..6424c2f59844 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -390,6 +390,9 @@ bool DWARFVerifier::handleDebugInfo() { OS << "Verifying non-dwo Units...\n"; NumErrors += verifyUnits(DCtx.getNormalUnitsVector()); + + OS << "Verifying dwo Units...\n"; + NumErrors += verifyUnits(DCtx.getDWOUnitsVector()); return NumErrors == 0; } @@ -400,10 +403,13 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die, if (!Die.isValid()) return NumErrors; + DWARFUnit *Unit = Die.getDwarfUnit(); + auto RangesOrError = Die.getAddressRanges(); if (!RangesOrError) { // FIXME: Report the error. - ++NumErrors; + if (!Unit->isDWOUnit()) + ++NumErrors; llvm::consumeError(RangesOrError.takeError()); return NumErrors; } @@ -496,15 +502,18 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, }; const DWARFObject &DObj = DCtx.getDWARFObj(); + DWARFUnit *U = Die.getDwarfUnit(); const auto Attr = AttrValue.Attr; switch (Attr) { case DW_AT_ranges: // Make sure the offset in the DW_AT_ranges attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { - unsigned DwarfVersion = Die.getDwarfUnit()->getVersion(); + unsigned DwarfVersion = U->getVersion(); const DWARFSection &RangeSection = DwarfVersion < 5 ? 
DObj.getRangesSection() : DObj.getRnglistsSection(); + if (U->isDWOUnit() && RangeSection.Data.empty()) + break; if (*SectionOffset >= RangeSection.Data.size()) ReportError( "DW_AT_ranges offset is beyond " + @@ -517,7 +526,7 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, case DW_AT_stmt_list: // Make sure the offset in the DW_AT_stmt_list attribute is valid. if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) { - if (*SectionOffset >= DObj.getLineSection().Data.size()) + if (*SectionOffset >= U->getLineSection().Data.size()) ReportError("DW_AT_stmt_list offset is beyond .debug_line bounds: " + llvm::formatv("{0:x8}", *SectionOffset)); break; @@ -525,9 +534,18 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, ReportError("DIE has invalid DW_AT_stmt_list encoding:"); break; case DW_AT_location: { + // FIXME: It might be nice if there's a way to walk location expressions + // without trying to resolve the address ranges - it'd be a more efficient + // API (since the API is currently unnecessarily resolving addresses for + // this use case which only wants to validate the expressions themselves) & + // then the expressions could be validated even if the addresses can't be + // resolved. + // That sort of API would probably look like a callback "for each + // expression" with some way to lazily resolve the address ranges when + // needed (& then the existing API used here could be built on top of that - + // using the callback API to build the data structure and return it). 
if (Expected<std::vector<DWARFLocationExpression>> Loc = Die.getLocations(DW_AT_location)) { - DWARFUnit *U = Die.getDwarfUnit(); for (const auto &Entry : *Loc) { DataExtractor Data(toStringRef(Entry.Expr), DCtx.isLittleEndian(), 0); DWARFExpression Expression(Data, U->getAddressByteSize(), @@ -539,8 +557,12 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, if (Error || !Expression.verify(U)) ReportError("DIE contains invalid DWARF expression:"); } - } else - ReportError(toString(Loc.takeError())); + } else if (Error Err = handleErrors( + Loc.takeError(), [&](std::unique_ptr<ResolverError> E) { + return U->isDWOUnit() ? Error::success() + : Error(std::move(E)); + })) + ReportError(toString(std::move(Err))); break; } case DW_AT_specification: @@ -576,7 +598,8 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, case DW_AT_call_file: case DW_AT_decl_file: { if (auto FileIdx = AttrValue.Value.getAsUnsignedConstant()) { - DWARFUnit *U = Die.getDwarfUnit(); + if (U->isDWOUnit() && !U->isTypeUnit()) + break; const auto *LT = U->getContext().getLineTableForUnit(U); if (LT) { if (!LT->hasFileAtIndex(*FileIdx)) { @@ -616,7 +639,6 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue, ReferenceMap &LocalReferences, ReferenceMap &CrossUnitReferences) { - const DWARFObject &DObj = DCtx.getDWARFObj(); auto DieCU = Die.getDwarfUnit(); unsigned NumErrors = 0; const auto Form = AttrValue.Value.getForm(); @@ -667,51 +689,15 @@ unsigned DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, } break; } - case DW_FORM_strp: { - auto SecOffset = AttrValue.Value.getAsSectionOffset(); - assert(SecOffset); // DW_FORM_strp is a section offset. 
- if (SecOffset && *SecOffset >= DObj.getStrSection().size()) { - ++NumErrors; - error() << "DW_FORM_strp offset beyond .debug_str bounds:\n"; - dump(Die) << '\n'; - } - break; - } + case DW_FORM_strp: case DW_FORM_strx: case DW_FORM_strx1: case DW_FORM_strx2: case DW_FORM_strx3: case DW_FORM_strx4: { - auto Index = AttrValue.Value.getRawUValue(); - auto DieCU = Die.getDwarfUnit(); - // Check that we have a valid DWARF v5 string offsets table. - if (!DieCU->getStringOffsetsTableContribution()) { - ++NumErrors; - error() << FormEncodingString(Form) - << " used without a valid string offsets table:\n"; - dump(Die) << '\n'; - break; - } - // Check that the index is within the bounds of the section. - unsigned ItemSize = DieCU->getDwarfStringOffsetsByteSize(); - // Use a 64-bit type to calculate the offset to guard against overflow. - uint64_t Offset = - (uint64_t)DieCU->getStringOffsetsBase() + Index * ItemSize; - if (DObj.getStrOffsetsSection().Data.size() < Offset + ItemSize) { - ++NumErrors; - error() << FormEncodingString(Form) << " uses index " - << format("%" PRIu64, Index) << ", which is too large:\n"; - dump(Die) << '\n'; - break; - } - // Check that the string offset is valid. 
- uint64_t StringOffset = *DieCU->getStringOffsetSectionItem(Index); - if (StringOffset >= DObj.getStrSection().size()) { + if (Error E = AttrValue.Value.getAsCString().takeError()) { ++NumErrors; - error() << FormEncodingString(Form) << " uses index " - << format("%" PRIu64, Index) - << ", but the referenced string" - " offset is beyond .debug_str bounds:\n"; + error() << toString(std::move(E)) << ":\n"; dump(Die) << '\n'; } break; diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index b2c43b893cd3..6eef6f84ab40 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -531,7 +531,7 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath) { << LR->Locations.size() << "\n"; Log << " " << NumDwarfInlineInfos << " DWARF frames:\n"; for (size_t Idx = 0; Idx < NumDwarfInlineInfos; ++Idx) { - const auto dii = DwarfInlineInfos.getFrame(Idx); + const auto &dii = DwarfInlineInfos.getFrame(Idx); Log << " [" << Idx << "]: " << dii.FunctionName << " @ " << dii.FileName << ':' << dii.Line << '\n'; } @@ -551,7 +551,7 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath) { ++Idx) { const auto &gii = LR->Locations[Idx]; if (Idx < NumDwarfInlineInfos) { - const auto dii = DwarfInlineInfos.getFrame(Idx); + const auto &dii = DwarfInlineInfos.getFrame(Idx); gsymFilename = LR->getSourceFile(Idx); // Verify function name if (dii.FunctionName.find(gii.Name.str()) != 0) diff --git a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp index 1a92e2cb7754..f9a763d724a8 100644 --- a/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFBuilder.cpp @@ -343,15 +343,25 @@ Expected<FileBufferByteStream> MSFBuilder::commit(StringRef Path, Layout = std::move(*L); uint64_t FileSize = uint64_t(Layout.SB->BlockSize) * Layout.SB->NumBlocks; - if (FileSize > UINT32_MAX) { - // FIXME: Changing the BinaryStream classes to use 64-bit numbers lets 
- // us create PDBs larger than 4 GiB successfully. The file format is - // block-based and as long as each stream is small enough, PDBs larger than - // 4 GiB might work. Check if tools can handle these large PDBs, and if so - // add support for writing them. + // Ensure that the file size is under the limit for the specified block size. + if (FileSize > getMaxFileSizeFromBlockSize(Layout.SB->BlockSize)) { + msf_error_code error_code = [](uint32_t BlockSize) { + switch (BlockSize) { + case 8192: + return msf_error_code::size_overflow_8192; + case 16384: + return msf_error_code::size_overflow_16384; + case 32768: + return msf_error_code::size_overflow_32768; + default: + return msf_error_code::size_overflow_4096; + } + }(Layout.SB->BlockSize); + return make_error<MSFError>( - msf_error_code::size_overflow, - formatv("File size would have been {0,1:N}", FileSize)); + error_code, + formatv("File size {0,1:N} too large for current PDB page size {1}", + FileSize, Layout.SB->BlockSize)); } auto OutFileOrError = FileOutputBuffer::create(Path, FileSize); diff --git a/llvm/lib/DebugInfo/MSF/MSFError.cpp b/llvm/lib/DebugInfo/MSF/MSFError.cpp index e42157e9d48e..9df2158423a4 100644 --- a/llvm/lib/DebugInfo/MSF/MSFError.cpp +++ b/llvm/lib/DebugInfo/MSF/MSFError.cpp @@ -28,8 +28,14 @@ public: case msf_error_code::insufficient_buffer: return "The buffer is not large enough to read the requested number of " "bytes."; - case msf_error_code::size_overflow: + case msf_error_code::size_overflow_4096: return "Output data is larger than 4 GiB."; + case msf_error_code::size_overflow_8192: + return "Output data is larger than 8 GiB."; + case msf_error_code::size_overflow_16384: + return "Output data is larger than 16 GiB."; + case msf_error_code::size_overflow_32768: + return "Output data is larger than 32 GiB."; case msf_error_code::not_writable: return "The specified stream is not writable."; case msf_error_code::no_stream: diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp 
b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp index cde645236851..5c61530c470d 100644 --- a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp @@ -100,7 +100,7 @@ PDBFile::getStreamBlockList(uint32_t StreamIndex) const { return ContainerLayout.StreamMap[StreamIndex]; } -uint32_t PDBFile::getFileSize() const { return Buffer->getLength(); } +uint64_t PDBFile::getFileSize() const { return Buffer->getLength(); } Expected<ArrayRef<uint8_t>> PDBFile::getBlockData(uint32_t BlockIndex, uint32_t NumBytes) const { diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp index fd9a0deb54d6..f9e67014477e 100644 --- a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp +++ b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp @@ -518,8 +518,8 @@ SymbolCache::findLineTable(uint16_t Modi) const { const std::vector<LineTableEntry> &RHS) { return LHS[0].Addr < RHS[0].Addr; }); - for (size_t I = 0; I < EntryList.size(); ++I) - llvm::append_range(ModuleLineTable, EntryList[I]); + for (std::vector<LineTableEntry> &I : EntryList) + llvm::append_range(ModuleLineTable, I); return ModuleLineTable; } diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index f3f09584fdc9..5ec79df17fed 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -20,6 +20,7 @@ #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/PDB/PDB.h" #include "llvm/DebugInfo/PDB/PDBContext.h" +#include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Demangle/Demangle.h" #include "llvm/Object/COFF.h" #include "llvm/Object/MachO.h" @@ -384,7 +385,14 @@ bool findDebugBinary(const std::vector<std::string> &DebugFileDirectory, } } } - return false; + // Try debuginfod client cache and known servers. 
+ Expected<std::string> PathOrErr = getCachedOrDownloadDebuginfo(BuildID); + if (!PathOrErr) { + consumeError(PathOrErr.takeError()); + return false; + } + Result = *PathOrErr; + return true; } } // end anonymous namespace diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp new file mode 100644 index 000000000000..389b18fd62ac --- /dev/null +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -0,0 +1,183 @@ +//===-- llvm/Debuginfod/Debuginfod.cpp - Debuginfod client library --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the fetchInfo function, which retrieves +/// any of the three supported artifact types: (executable, debuginfo, source +/// file) associated with a build-id from debuginfod servers. If a source file +/// is to be fetched, its absolute path must be specified in the Description +/// argument to fetchInfo. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/Debuginfod.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Support/CachePruning.h" +#include "llvm/Support/Caching.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/xxhash.h" + +namespace llvm { +static std::string uniqueKey(llvm::StringRef S) { return utostr(xxHash64(S)); } + +// Returns a binary BuildID as a normalized hex string. +// Uses lowercase for compatibility with common debuginfod servers. 
+static std::string buildIDToString(BuildIDRef ID) { + return llvm::toHex(ID, /*LowerCase=*/true); +} + +Expected<SmallVector<StringRef>> getDefaultDebuginfodUrls() { + const char *DebuginfodUrlsEnv = std::getenv("DEBUGINFOD_URLS"); + if (DebuginfodUrlsEnv == NULL) + return SmallVector<StringRef>(); + + SmallVector<StringRef> DebuginfodUrls; + StringRef(DebuginfodUrlsEnv).split(DebuginfodUrls, " "); + return DebuginfodUrls; +} + +Expected<std::string> getDefaultDebuginfodCacheDirectory() { + if (const char *CacheDirectoryEnv = std::getenv("DEBUGINFOD_CACHE_PATH")) + return CacheDirectoryEnv; + + SmallString<64> CacheDirectory; + if (!sys::path::cache_directory(CacheDirectory)) + return createStringError( + errc::io_error, "Unable to determine appropriate cache directory."); + return std::string(CacheDirectory); +} + +std::chrono::milliseconds getDefaultDebuginfodTimeout() { + long Timeout; + const char *DebuginfodTimeoutEnv = std::getenv("DEBUGINFOD_TIMEOUT"); + if (DebuginfodTimeoutEnv && + to_integer(StringRef(DebuginfodTimeoutEnv).trim(), Timeout, 10)) + return std::chrono::milliseconds(Timeout * 1000); + + return std::chrono::milliseconds(90 * 1000); +} + +/// The following functions fetch a debuginfod artifact to a file in a local +/// cache and return the cached file path. They first search the local cache, +/// followed by the debuginfod servers. 
+ +Expected<std::string> getCachedOrDownloadSource(BuildIDRef ID, + StringRef SourceFilePath) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "source", + sys::path::convert_to_slash(SourceFilePath)); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +Expected<std::string> getCachedOrDownloadExecutable(BuildIDRef ID) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "executable"); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +Expected<std::string> getCachedOrDownloadDebuginfo(BuildIDRef ID) { + SmallString<64> UrlPath; + sys::path::append(UrlPath, sys::path::Style::posix, "buildid", + buildIDToString(ID), "debuginfo"); + return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); +} + +// General fetching function. +Expected<std::string> getCachedOrDownloadArtifact(StringRef UniqueKey, + StringRef UrlPath) { + SmallString<10> CacheDir; + + Expected<std::string> CacheDirOrErr = getDefaultDebuginfodCacheDirectory(); + if (!CacheDirOrErr) + return CacheDirOrErr.takeError(); + CacheDir = *CacheDirOrErr; + + Expected<SmallVector<StringRef>> DebuginfodUrlsOrErr = + getDefaultDebuginfodUrls(); + if (!DebuginfodUrlsOrErr) + return DebuginfodUrlsOrErr.takeError(); + SmallVector<StringRef> &DebuginfodUrls = *DebuginfodUrlsOrErr; + return getCachedOrDownloadArtifact(UniqueKey, UrlPath, CacheDir, + DebuginfodUrls, + getDefaultDebuginfodTimeout()); +} + +Expected<std::string> getCachedOrDownloadArtifact( + StringRef UniqueKey, StringRef UrlPath, StringRef CacheDirectoryPath, + ArrayRef<StringRef> DebuginfodUrls, std::chrono::milliseconds Timeout) { + SmallString<64> AbsCachedArtifactPath; + sys::path::append(AbsCachedArtifactPath, CacheDirectoryPath, + "llvmcache-" + UniqueKey); + + Expected<FileCache> CacheOrErr = + localCache("Debuginfod-client", ".debuginfod-client", CacheDirectoryPath); + 
if (!CacheOrErr) + return CacheOrErr.takeError(); + + FileCache Cache = *CacheOrErr; + // We choose an arbitrary Task parameter as we do not make use of it. + unsigned Task = 0; + Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, UniqueKey); + if (!CacheAddStreamOrErr) + return CacheAddStreamOrErr.takeError(); + AddStreamFn &CacheAddStream = *CacheAddStreamOrErr; + if (!CacheAddStream) + return std::string(AbsCachedArtifactPath); + // The artifact was not found in the local cache, query the debuginfod + // servers. + if (!HTTPClient::isAvailable()) + return createStringError(errc::io_error, + "No working HTTP client is available."); + + if (!HTTPClient::IsInitialized) + return createStringError( + errc::io_error, + "A working HTTP client is available, but it is not initialized. To " + "allow Debuginfod to make HTTP requests, call HTTPClient::initialize() " + "at the beginning of main."); + + HTTPClient Client; + Client.setTimeout(Timeout); + for (StringRef ServerUrl : DebuginfodUrls) { + SmallString<64> ArtifactUrl; + sys::path::append(ArtifactUrl, sys::path::Style::posix, ServerUrl, UrlPath); + + Expected<HTTPResponseBuffer> ResponseOrErr = Client.get(ArtifactUrl); + if (!ResponseOrErr) + return ResponseOrErr.takeError(); + + HTTPResponseBuffer &Response = *ResponseOrErr; + if (Response.Code != 200) + continue; + + // We have retrieved the artifact from this server, and now add it to the + // file cache. + Expected<std::unique_ptr<CachedFileStream>> FileStreamOrErr = + CacheAddStream(Task); + if (!FileStreamOrErr) + return FileStreamOrErr.takeError(); + std::unique_ptr<CachedFileStream> &FileStream = *FileStreamOrErr; + if (!Response.Body) + return createStringError( + errc::io_error, "Unallocated MemoryBuffer in HTTPResponseBuffer."); + + *FileStream->OS << StringRef(Response.Body->getBufferStart(), + Response.Body->getBufferSize()); + + // Return the path to the artifact on disk. 
+ return std::string(AbsCachedArtifactPath); + } + + return createStringError(errc::argument_out_of_domain, "build id not found"); +} +} // namespace llvm diff --git a/llvm/lib/Debuginfod/HTTPClient.cpp b/llvm/lib/Debuginfod/HTTPClient.cpp new file mode 100644 index 000000000000..65f457933b92 --- /dev/null +++ b/llvm/lib/Debuginfod/HTTPClient.cpp @@ -0,0 +1,216 @@ +//===-- llvm/Debuginfod/HTTPClient.cpp - HTTP client library ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// +/// This file defines the methods of the HTTPRequest, HTTPClient, and +/// BufferedHTTPResponseHandler classes. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#ifdef LLVM_ENABLE_CURL +#include <curl/curl.h> +#endif + +using namespace llvm; + +HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } + +bool operator==(const HTTPRequest &A, const HTTPRequest &B) { + return A.Url == B.Url && A.Method == B.Method && + A.FollowRedirects == B.FollowRedirects; +} + +HTTPResponseHandler::~HTTPResponseHandler() = default; + +static inline bool parseContentLengthHeader(StringRef LineRef, + size_t &ContentLength) { + // Content-Length is a mandatory header, and the only one we handle. 
+ return LineRef.consume_front("Content-Length: ") && + to_integer(LineRef.trim(), ContentLength, 10); +} + +Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { + if (ResponseBuffer.Body) + return Error::success(); + + size_t ContentLength; + if (parseContentLengthHeader(HeaderLine, ContentLength)) + ResponseBuffer.Body = + WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); + + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { + if (!ResponseBuffer.Body) + return createStringError(errc::io_error, + "Unallocated response buffer. HTTP Body data " + "received before Content-Length header."); + if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) + return createStringError(errc::io_error, + "Content size exceeds buffer size."); + memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), + BodyChunk.size()); + Offset += BodyChunk.size(); + return Error::success(); +} + +Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { + ResponseBuffer.Code = Code; + return Error::success(); +} + +bool HTTPClient::IsInitialized = false; + +class HTTPClientCleanup { +public: + ~HTTPClientCleanup() { HTTPClient::cleanup(); } +}; +static const HTTPClientCleanup Cleanup; + +Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { + BufferedHTTPResponseHandler Handler; + if (Error Err = perform(Request, Handler)) + return std::move(Err); + return std::move(Handler.ResponseBuffer); +} + +Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { + HTTPRequest Request(Url); + return perform(Request); +} + +#ifdef LLVM_ENABLE_CURL + +bool HTTPClient::isAvailable() { return true; } + +void HTTPClient::initialize() { + if (!IsInitialized) { + curl_global_init(CURL_GLOBAL_ALL); + IsInitialized = true; + } +} + +void HTTPClient::cleanup() { + if (IsInitialized) { + curl_global_cleanup(); + IsInitialized = false; + } +} + +void 
HTTPClient::setTimeout(std::chrono::milliseconds Timeout) { + if (Timeout < std::chrono::milliseconds(0)) + Timeout = std::chrono::milliseconds(0); + curl_easy_setopt(Curl, CURLOPT_TIMEOUT_MS, Timeout.count()); +} + +/// CurlHTTPRequest and the curl{Header,Write}Function are implementation +/// details used to work with Curl. Curl makes callbacks with a single +/// customizable pointer parameter. +struct CurlHTTPRequest { + CurlHTTPRequest(HTTPResponseHandler &Handler) : Handler(Handler) {} + void storeError(Error Err) { + ErrorState = joinErrors(std::move(Err), std::move(ErrorState)); + } + HTTPResponseHandler &Handler; + llvm::Error ErrorState = Error::success(); +}; + +static size_t curlHeaderFunction(char *Contents, size_t Size, size_t NMemb, + CurlHTTPRequest *CurlRequest) { + assert(Size == 1 && "The Size passed by libCURL to CURLOPT_HEADERFUNCTION " + "should always be 1."); + if (Error Err = + CurlRequest->Handler.handleHeaderLine(StringRef(Contents, NMemb))) { + CurlRequest->storeError(std::move(Err)); + return 0; + } + return NMemb; +} + +static size_t curlWriteFunction(char *Contents, size_t Size, size_t NMemb, + CurlHTTPRequest *CurlRequest) { + Size *= NMemb; + if (Error Err = + CurlRequest->Handler.handleBodyChunk(StringRef(Contents, Size))) { + CurlRequest->storeError(std::move(Err)); + return 0; + } + return Size; +} + +HTTPClient::HTTPClient() { + assert(IsInitialized && + "Must call HTTPClient::initialize() at the beginning of main()."); + if (Curl) + return; + assert((Curl = curl_easy_init()) && "Curl could not be initialized."); + // Set the callback hooks. 
+ curl_easy_setopt(Curl, CURLOPT_WRITEFUNCTION, curlWriteFunction); + curl_easy_setopt(Curl, CURLOPT_HEADERFUNCTION, curlHeaderFunction); +} + +HTTPClient::~HTTPClient() { curl_easy_cleanup(Curl); } + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + if (Request.Method != HTTPMethod::GET) + return createStringError(errc::invalid_argument, + "Unsupported CURL request method."); + + SmallString<128> Url = Request.Url; + curl_easy_setopt(Curl, CURLOPT_URL, Url.c_str()); + curl_easy_setopt(Curl, CURLOPT_FOLLOWLOCATION, Request.FollowRedirects); + + CurlHTTPRequest CurlRequest(Handler); + curl_easy_setopt(Curl, CURLOPT_WRITEDATA, &CurlRequest); + curl_easy_setopt(Curl, CURLOPT_HEADERDATA, &CurlRequest); + CURLcode CurlRes = curl_easy_perform(Curl); + if (CurlRes != CURLE_OK) + return joinErrors(std::move(CurlRequest.ErrorState), + createStringError(errc::io_error, + "curl_easy_perform() failed: %s\n", + curl_easy_strerror(CurlRes))); + if (CurlRequest.ErrorState) + return std::move(CurlRequest.ErrorState); + + unsigned Code; + curl_easy_getinfo(Curl, CURLINFO_RESPONSE_CODE, &Code); + if (Error Err = Handler.handleStatusCode(Code)) + return joinErrors(std::move(CurlRequest.ErrorState), std::move(Err)); + + return std::move(CurlRequest.ErrorState); +} + +#else + +HTTPClient::HTTPClient() = default; + +HTTPClient::~HTTPClient() = default; + +bool HTTPClient::isAvailable() { return false; } + +void HTTPClient::initialize() {} + +void HTTPClient::cleanup() {} + +void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} + +Error HTTPClient::perform(const HTTPRequest &Request, + HTTPResponseHandler &Handler) { + llvm_unreachable("No HTTP Client implementation available."); +} + +#endif diff --git a/llvm/lib/Demangle/DLangDemangle.cpp b/llvm/lib/Demangle/DLangDemangle.cpp index f380aa90035e..0cefbd63a7ae 100644 --- a/llvm/lib/Demangle/DLangDemangle.cpp +++ b/llvm/lib/Demangle/DLangDemangle.cpp @@ -242,11 +242,77 @@ const char 
*Demangler::parseIdentifier(OutputBuffer *Demangled, // TODO: Parse template instances with a length prefix. + // There can be multiple different declarations in the same function that + // have the same mangled name. To make the mangled names unique, a fake + // parent in the form `__Sddd' is added to the symbol. + if (Len >= 4 && Mangled[0] == '_' && Mangled[1] == '_' && Mangled[2] == 'S') { + const char *NumPtr = Mangled + 3; + while (NumPtr < (Mangled + Len) && std::isdigit(*NumPtr)) + ++NumPtr; + + if (Mangled + Len == NumPtr) { + // Skip over the fake parent. + Mangled += Len; + return parseIdentifier(Demangled, Mangled); + } + + // Else demangle it as a plain identifier. + } + return parseLName(Demangled, Mangled, Len); } const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled, unsigned long Len) { + switch (Len) { + case 6: + if (strncmp(Mangled, "__initZ", Len + 1) == 0) { + // The static initializer for a given symbol. + Demangled->prepend("initializer for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + if (strncmp(Mangled, "__vtblZ", Len + 1) == 0) { + // The vtable symbol for a given class. + Demangled->prepend("vtable for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 7: + if (strncmp(Mangled, "__ClassZ", Len + 1) == 0) { + // The classinfo symbol for a given class. + Demangled->prepend("ClassInfo for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 11: + if (strncmp(Mangled, "__InterfaceZ", Len + 1) == 0) { + // The interface symbol for a given class. 
+ Demangled->prepend("Interface for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + + case 12: + if (strncmp(Mangled, "__ModuleInfoZ", Len + 1) == 0) { + // The ModuleInfo symbol for a given module. + Demangled->prepend("ModuleInfo for "); + Demangled->setCurrentPosition(Demangled->getCurrentPosition() - 1); + Mangled += Len; + return Mangled; + } + break; + } + *Demangled << StringView(Mangled, Len); Mangled += Len; diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 3ea9ffee6554..27d8833ae19e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -241,7 +241,9 @@ private: } case Branch32: { Kind = x86_64::BranchPCRel32; - Addend = 0; + // BranchPCRel32 implicitly handles the '-4' PC adjustment, so we have to + // adjust the addend by '+4' to compensate. + Addend += 4; break; } } @@ -252,7 +254,7 @@ private: Edge GE(Kind, Offset, *GraphSymbol, Addend); LLVM_DEBUG({ dbgs() << " "; - printEdge(dbgs(), *BlockToFix, GE, getELFX86RelocationKindName(Kind)); + printEdge(dbgs(), *BlockToFix, GE, x86_64::getEdgeKindName(Kind)); dbgs() << "\n"; }); diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 200f42aec067..ed912280ac82 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -170,8 +170,8 @@ std::unique_ptr<MemoryBuffer> MCJIT::emitObject(Module *M) { PM.run(*M); // Flush the output buffer to get the generated code into memory - std::unique_ptr<MemoryBuffer> CompiledObjBuffer( - new SmallVectorMemoryBuffer(std::move(ObjBufferSV))); + auto CompiledObjBuffer = std::make_unique<SmallVectorMemoryBuffer>( + std::move(ObjBufferSV), /*RequiresNullTerminator=*/false); // If we have an object cache, tell it about the new object. 
// Note that we're using the compiled image, not the loaded image (as below). diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h index 52e7eda90310..a5dd420c9132 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h @@ -151,12 +151,8 @@ class MCJIT : public ExecutionEngine { } void markAllLoadedModulesAsFinalized() { - for (ModulePtrSet::iterator I = LoadedModules.begin(), - E = LoadedModules.end(); - I != E; ++I) { - Module *M = *I; + for (Module *M : LoadedModules) FinalizedModules.insert(M); - } LoadedModules.clear(); } @@ -167,10 +163,8 @@ class MCJIT : public ExecutionEngine { void freeModulePtrSet(ModulePtrSet& MPS) { // Go through the module set and delete everything. - for (ModulePtrSet::iterator I = MPS.begin(), E = MPS.end(); I != E; ++I) { - Module *M = *I; + for (Module *M : MPS) delete M; - } MPS.clear(); } }; diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 9ff6cec8c6c5..e2a0cadb6348 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -78,11 +78,10 @@ public: : IRMaterializationUnit(ES, MO, std::move(TSM)), Parent(Parent) {} PartitioningIRMaterializationUnit( - ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition, + ThreadSafeModule TSM, Interface I, + SymbolNameToDefinitionMap SymbolToDefinition, CompileOnDemandLayer &Parent) - : IRMaterializationUnit(std::move(TSM), std::move(SymbolFlags), - std::move(InitSymbol), + : IRMaterializationUnit(std::move(TSM), std::move(I), std::move(SymbolToDefinition)), Parent(Parent) {} @@ -298,7 +297,9 @@ void CompileOnDemandLayer::emitPartition( if (GVsToExtract->empty()) { if (auto Err = R->replace(std::make_unique<PartitioningIRMaterializationUnit>( - std::move(TSM), R->getSymbols(), 
R->getInitializerSymbol(), + std::move(TSM), + MaterializationUnit::Interface(R->getSymbols(), + R->getInitializerSymbol()), std::move(Defs), *this))) { getExecutionSession().reportError(std::move(Err)); R->failMaterialization(); diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp index f8efed15edea..f34247005258 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp @@ -53,7 +53,8 @@ Expected<SimpleCompiler::CompileResult> SimpleCompiler::operator()(Module &M) { } auto ObjBuffer = std::make_unique<SmallVectorMemoryBuffer>( - std::move(ObjBufferSV), M.getModuleIdentifier() + "-jitted-objectbuffer"); + std::move(ObjBufferSV), M.getModuleIdentifier() + "-jitted-objectbuffer", + /*RequiresNullTerminator=*/false); auto Obj = object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()); diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 56a97f83d915..aa82cf38c45d 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -243,8 +243,7 @@ void AsynchronousSymbolQuery::detach() { AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit( SymbolMap Symbols) - : MaterializationUnit(extractFlags(Symbols), nullptr), - Symbols(std::move(Symbols)) {} + : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {} StringRef AbsoluteSymbolsMaterializationUnit::getName() const { return "<Absolute Symbols>"; @@ -263,18 +262,18 @@ void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, Symbols.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) { SymbolFlagsMap Flags; for (const auto &KV : Symbols) Flags[KV.first] = KV.second.getFlags(); - return Flags; + return MaterializationUnit::Interface(std::move(Flags), nullptr); } 
ReExportsMaterializationUnit::ReExportsMaterializationUnit( JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags, SymbolAliasMap Aliases) - : MaterializationUnit(extractFlags(Aliases), nullptr), SourceJD(SourceJD), + : MaterializationUnit(extractFlags(Aliases)), SourceJD(SourceJD), SourceJDLookupFlags(SourceJDLookupFlags), Aliases(std::move(Aliases)) {} StringRef ReExportsMaterializationUnit::getName() const { @@ -456,13 +455,13 @@ void ReExportsMaterializationUnit::discard(const JITDylib &JD, Aliases.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface ReExportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { SymbolFlagsMap SymbolFlags; for (auto &KV : Aliases) SymbolFlags[KV.first] = KV.second.AliasFlags; - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), nullptr); } Expected<SymbolAliasMap> buildSimpleReexportsAliasMap(JITDylib &SourceJD, @@ -2492,10 +2491,19 @@ void ExecutionSession::OL_applyQueryPhase1( } } - // If we get here then we've moved on to the next JITDylib. - LLVM_DEBUG(dbgs() << "Phase 1 moving to next JITDylib.\n"); - ++IPLS->CurSearchOrderIndex; - IPLS->NewJITDylib = true; + if (IPLS->DefGeneratorCandidates.empty() && + IPLS->DefGeneratorNonCandidates.empty()) { + // Early out if there are no remaining symbols. + LLVM_DEBUG(dbgs() << "All symbols matched.\n"); + IPLS->CurSearchOrderIndex = IPLS->SearchOrder.size(); + break; + } else { + // If we get here then we've moved on to the next JITDylib with candidates + // remaining. + LLVM_DEBUG(dbgs() << "Phase 1 moving to next JITDylib.\n"); + ++IPLS->CurSearchOrderIndex; + IPLS->NewJITDylib = true; + } } // Remove any weakly referenced candidates that could not be found/generated. 
diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp index 8479495623b8..fe62138c790c 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp @@ -154,8 +154,24 @@ public: } DebugSecInfos.push_back({&Sec, Sec.getName().substr(0, SepPos), Sec.getName().substr(SepPos + 1), 0, 0}); - } else + } else { NonDebugSections.push_back(&Sec); + + // If the first block in the section has a non-zero alignment offset + // then we need to add a padding block, since the section command in + // the header doesn't allow for aligment offsets. + SectionRange R(Sec); + if (!R.empty()) { + auto &FB = *R.getFirstBlock(); + if (FB.getAlignmentOffset() != 0) { + auto Padding = G.allocateBuffer(FB.getAlignmentOffset()); + memset(Padding.data(), 0, Padding.size()); + G.createContentBlock(Sec, Padding, + FB.getAddress() - FB.getAlignmentOffset(), + FB.getAlignment(), 0); + } + } + } } // Create container block. 
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index b17d196f01b6..eded54f4bfb3 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -28,8 +28,8 @@ class DSOHandleMaterializationUnit : public MaterializationUnit { public: DSOHandleMaterializationUnit(ELFNixPlatform &ENP, const SymbolStringPtr &DSOHandleSymbol) - : MaterializationUnit(createDSOHandleSectionSymbols(ENP, DSOHandleSymbol), - DSOHandleSymbol), + : MaterializationUnit( + createDSOHandleSectionInterface(ENP, DSOHandleSymbol)), ENP(ENP) {} StringRef getName() const override { return "DSOHandleMU"; } @@ -70,12 +70,13 @@ public: void discard(const JITDylib &JD, const SymbolStringPtr &Sym) override {} private: - static SymbolFlagsMap - createDSOHandleSectionSymbols(ELFNixPlatform &ENP, - const SymbolStringPtr &DSOHandleSymbol) { + static MaterializationUnit::Interface + createDSOHandleSectionInterface(ELFNixPlatform &ENP, + const SymbolStringPtr &DSOHandleSymbol) { SymbolFlagsMap SymbolFlags; SymbolFlags[DSOHandleSymbol] = JITSymbolFlags::Exported; - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), + DSOHandleSymbol); } ArrayRef<char> getDSOHandleContent(size_t PointerSize) { diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 2ab9ed4f856b..ae2d47fb8c5e 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" @@ -269,25 +270,30 @@ Error DynamicLibrarySearchGenerator::tryToGenerate( } Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> 
-StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName) { +StaticLibraryDefinitionGenerator::Load( + ObjectLayer &L, const char *FileName, + GetObjectFileInterface GetObjFileInterface) { auto ArchiveBuffer = errorOrToExpected(MemoryBuffer::getFile(FileName)); if (!ArchiveBuffer) return ArchiveBuffer.takeError(); - return Create(L, std::move(*ArchiveBuffer)); + return Create(L, std::move(*ArchiveBuffer), std::move(GetObjFileInterface)); } Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> -StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, - const Triple &TT) { +StaticLibraryDefinitionGenerator::Load( + ObjectLayer &L, const char *FileName, const Triple &TT, + GetObjectFileInterface GetObjFileInterface) { + auto B = object::createBinary(FileName); if (!B) return B.takeError(); // If this is a regular archive then create an instance from it. if (isa<object::Archive>(B->getBinary())) - return Create(L, std::move(B->takeBinary().second)); + return Create(L, std::move(B->takeBinary().second), + std::move(GetObjFileInterface)); // If this is a universal binary then search for a slice matching the given // Triple. @@ -309,7 +315,8 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, " .. 
" + formatv("{0:x}", Obj.getOffset() + Obj.getSize()) + ": " + SliceBuffer.getError().message(), SliceBuffer.getError()); - return Create(L, std::move(*SliceBuffer)); + return Create(L, std::move(*SliceBuffer), + std::move(GetObjFileInterface)); } } @@ -326,11 +333,13 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName, Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> StaticLibraryDefinitionGenerator::Create( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer) { + ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, + GetObjectFileInterface GetObjFileInterface) { Error Err = Error::success(); std::unique_ptr<StaticLibraryDefinitionGenerator> ADG( - new StaticLibraryDefinitionGenerator(L, std::move(ArchiveBuffer), Err)); + new StaticLibraryDefinitionGenerator( + L, std::move(ArchiveBuffer), std::move(GetObjFileInterface), Err)); if (Err) return std::move(Err); @@ -371,7 +380,12 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( MemoryBufferRef ChildBufferRef(ChildBufferInfo.first, ChildBufferInfo.second); - if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false))) + auto I = GetObjFileInterface(L.getExecutionSession(), ChildBufferRef); + if (!I) + return I.takeError(); + + if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false), + std::move(*I))) return Err; } @@ -379,9 +393,15 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate( } StaticLibraryDefinitionGenerator::StaticLibraryDefinitionGenerator( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, Error &Err) - : L(L), ArchiveBuffer(std::move(ArchiveBuffer)), - Archive(std::make_unique<object::Archive>(*this->ArchiveBuffer, Err)) {} + ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer, + GetObjectFileInterface GetObjFileInterface, Error &Err) + : L(L), GetObjFileInterface(std::move(GetObjFileInterface)), + ArchiveBuffer(std::move(ArchiveBuffer)), + 
Archive(std::make_unique<object::Archive>(*this->ArchiveBuffer, Err)) { + + if (!this->GetObjFileInterface) + this->GetObjFileInterface = getObjectFileInterface; +} } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index ee1630a2ffa8..f427271bb45d 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -31,8 +31,8 @@ public: CompileCallbackMaterializationUnit(SymbolStringPtr Name, CompileFunction Compile) - : MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}), - nullptr), + : MaterializationUnit(Interface( + SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}), nullptr)), Name(std::move(Name)), Compile(std::move(Compile)) {} StringRef getName() const override { return "<Compile Callbacks>"; } diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp index 20dfba23bf10..adb8861793b1 100644 --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -10,9 +10,8 @@ #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" #include "llvm/IR/Constants.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "orc" @@ -33,7 +32,7 @@ Error IRLayer::add(ResourceTrackerSP RT, ThreadSafeModule TSM) { IRMaterializationUnit::IRMaterializationUnit( ExecutionSession &ES, const IRSymbolMapper::ManglingOptions &MO, ThreadSafeModule TSM) - : MaterializationUnit(SymbolFlagsMap(), nullptr), TSM(std::move(TSM)) { + : MaterializationUnit(Interface()), TSM(std::move(TSM)) { assert(this->TSM && "Module must not be null"); @@ -98,10 +97,10 @@ IRMaterializationUnit::IRMaterializationUnit( } IRMaterializationUnit::IRMaterializationUnit( - ThreadSafeModule TSM, 
SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition) - : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol)), - TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {} + ThreadSafeModule TSM, Interface I, + SymbolNameToDefinitionMap SymbolToDefinition) + : MaterializationUnit(std::move(I)), TSM(std::move(TSM)), + SymbolToDefinition(std::move(SymbolToDefinition)) {} StringRef IRMaterializationUnit::getName() const { if (TSM) @@ -161,37 +160,47 @@ ObjectLayer::ObjectLayer(ExecutionSession &ES) : ES(ES) {} ObjectLayer::~ObjectLayer() {} -Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O) { +Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O, + MaterializationUnit::Interface I) { assert(RT && "RT can not be null"); - auto ObjMU = BasicObjectLayerMaterializationUnit::Create(*this, std::move(O)); - if (!ObjMU) - return ObjMU.takeError(); auto &JD = RT->getJITDylib(); - return JD.define(std::move(*ObjMU), std::move(RT)); + return JD.define(std::make_unique<BasicObjectLayerMaterializationUnit>( + *this, std::move(O), std::move(I)), + std::move(RT)); +} + +Error ObjectLayer::add(ResourceTrackerSP RT, std::unique_ptr<MemoryBuffer> O) { + auto I = getObjectFileInterface(getExecutionSession(), O->getMemBufferRef()); + if (!I) + return I.takeError(); + return add(std::move(RT), std::move(O), std::move(*I)); +} + +Error ObjectLayer::add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O) { + auto I = getObjectFileInterface(getExecutionSession(), O->getMemBufferRef()); + if (!I) + return I.takeError(); + return add(JD, std::move(O), std::move(*I)); } Expected<std::unique_ptr<BasicObjectLayerMaterializationUnit>> BasicObjectLayerMaterializationUnit::Create(ObjectLayer &L, std::unique_ptr<MemoryBuffer> O) { - auto ObjSymInfo = - getObjectSymbolInfo(L.getExecutionSession(), O->getMemBufferRef()); - if (!ObjSymInfo) - return ObjSymInfo.takeError(); + 
auto ObjInterface = + getObjectFileInterface(L.getExecutionSession(), O->getMemBufferRef()); - auto &SymbolFlags = ObjSymInfo->first; - auto &InitSymbol = ObjSymInfo->second; + if (!ObjInterface) + return ObjInterface.takeError(); return std::unique_ptr<BasicObjectLayerMaterializationUnit>( - new BasicObjectLayerMaterializationUnit( - L, std::move(O), std::move(SymbolFlags), std::move(InitSymbol))); + new BasicObjectLayerMaterializationUnit(L, std::move(O), + std::move(*ObjInterface))); } BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit( - ObjectLayer &L, std::unique_ptr<MemoryBuffer> O, SymbolFlagsMap SymbolFlags, - SymbolStringPtr InitSymbol) - : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol)), L(L), - O(std::move(O)) {} + ObjectLayer &L, std::unique_ptr<MemoryBuffer> O, Interface I) + : MaterializationUnit(std::move(I)), L(L), O(std::move(O)) {} StringRef BasicObjectLayerMaterializationUnit::getName() const { if (O) diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index e1f494415e86..66453e6a632f 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -144,7 +144,7 @@ createLocalLazyCallThroughManager(const Triple &T, ExecutionSession &ES, LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit( LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager, JITDylib &SourceJD, SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc) - : MaterializationUnit(extractFlags(CallableAliases), nullptr), + : MaterializationUnit(extractFlags(CallableAliases)), LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD), CallableAliases(std::move(CallableAliases)), AliaseeTable(SrcJDLoc) {} @@ -219,7 +219,7 @@ void LazyReexportsMaterializationUnit::discard(const JITDylib &JD, CallableAliases.erase(Name); } -SymbolFlagsMap +MaterializationUnit::Interface 
LazyReexportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { SymbolFlagsMap SymbolFlags; for (auto &KV : Aliases) { @@ -227,7 +227,7 @@ LazyReexportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { "Lazy re-exports must be callable symbols"); SymbolFlags[KV.first] = KV.second.AliasFlags; } - return SymbolFlags; + return MaterializationUnit::Interface(std::move(SymbolFlags), nullptr); } } // End namespace orc. diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 46c915dfea9e..fb2e90e1c9c5 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -28,8 +28,7 @@ class MachOHeaderMaterializationUnit : public MaterializationUnit { public: MachOHeaderMaterializationUnit(MachOPlatform &MOP, const SymbolStringPtr &HeaderStartSymbol) - : MaterializationUnit(createHeaderSymbols(MOP, HeaderStartSymbol), - HeaderStartSymbol), + : MaterializationUnit(createHeaderInterface(MOP, HeaderStartSymbol)), MOP(MOP) {} StringRef getName() const override { return "MachOHeaderMU"; } @@ -110,9 +109,9 @@ private: return G.createContentBlock(HeaderSection, HeaderContent, 0, 8, 0); } - static SymbolFlagsMap - createHeaderSymbols(MachOPlatform &MOP, - const SymbolStringPtr &HeaderStartSymbol) { + static MaterializationUnit::Interface + createHeaderInterface(MachOPlatform &MOP, + const SymbolStringPtr &HeaderStartSymbol) { SymbolFlagsMap HeaderSymbolFlags; HeaderSymbolFlags[HeaderStartSymbol] = JITSymbolFlags::Exported; @@ -120,7 +119,8 @@ private: HeaderSymbolFlags[MOP.getExecutionSession().intern(HS.Name)] = JITSymbolFlags::Exported; - return HeaderSymbolFlags; + return MaterializationUnit::Interface(std::move(HeaderSymbolFlags), + HeaderStartSymbol); } MachOPlatform &MOP; diff --git a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp index 7b21e6a684ca..9c243c9bf1d2 100644 --- 
a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp @@ -7,13 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/Mangling.h" -#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" -#include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Mangler.h" -#include "llvm/Object/ELFObjectFile.h" -#include "llvm/Object/MachO.h" -#include "llvm/Object/ObjectFile.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "orc" @@ -85,188 +80,5 @@ void IRSymbolMapper::add(ExecutionSession &ES, const ManglingOptions &MO, } } -static SymbolStringPtr addInitSymbol(SymbolFlagsMap &SymbolFlags, - ExecutionSession &ES, - StringRef ObjFileName) { - SymbolStringPtr InitSymbol; - size_t Counter = 0; - - do { - std::string InitSymString; - raw_string_ostream(InitSymString) - << "$." << ObjFileName << ".__inits." << Counter++; - InitSymbol = ES.intern(InitSymString); - } while (SymbolFlags.count(InitSymbol)); - - SymbolFlags[InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly; - return InitSymbol; -} - -static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getMachOObjectFileSymbolInfo(ExecutionSession &ES, - const object::MachOObjectFile &Obj) { - SymbolFlagsMap SymbolFlags; - - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - // Strip the 'exported' flag from MachO linker-private symbols. - if (Name->startswith("l")) - *SymFlags &= ~JITSymbolFlags::Exported; - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - SymbolStringPtr InitSymbol; - for (auto &Sec : Obj.sections()) { - auto SecType = Obj.getSectionType(Sec); - if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - auto SegName = Obj.getSectionFinalSegmentName(Sec.getRawDataRefImpl()); - auto SecName = cantFail(Obj.getSectionName(Sec.getRawDataRefImpl())); - if (MachOPlatform::isInitializerSection(SegName, SecName)) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - } - - return std::make_pair(std::move(SymbolFlags), std::move(InitSymbol)); -} - -static Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getELFObjectFileSymbolInfo(ExecutionSession &ES, - const object::ELFObjectFileBase &Obj) { - SymbolFlagsMap SymbolFlags; - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - // ELF STB_GNU_UNIQUE should map to Weak for ORC. - if (Sym.getBinding() == ELF::STB_GNU_UNIQUE) - *SymFlags |= JITSymbolFlags::Weak; - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - SymbolStringPtr InitSymbol; - for (auto &Sec : Obj.sections()) { - if (auto SecName = Sec.getName()) { - if (ELFNixPlatform::isInitializerSection(*SecName)) { - InitSymbol = addInitSymbol(SymbolFlags, ES, Obj.getFileName()); - break; - } - } - } - - return std::make_pair(std::move(SymbolFlags), InitSymbol); -} - -Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getGenericObjectFileSymbolInfo(ExecutionSession &ES, - const object::ObjectFile &Obj) { - SymbolFlagsMap SymbolFlags; - for (auto &Sym : Obj.symbols()) { - Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); - if (!SymFlagsOrErr) - // TODO: Test this error. - return SymFlagsOrErr.takeError(); - - // Skip symbols not defined in this object file. - if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) - continue; - - // Skip symbols that are not global. - if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) - continue; - - // Skip symbols that have type SF_File. 
- if (auto SymType = Sym.getType()) { - if (*SymType == object::SymbolRef::ST_File) - continue; - } else - return SymType.takeError(); - - auto Name = Sym.getName(); - if (!Name) - return Name.takeError(); - auto InternedName = ES.intern(*Name); - auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); - if (!SymFlags) - return SymFlags.takeError(); - - SymbolFlags[InternedName] = std::move(*SymFlags); - } - - return std::make_pair(std::move(SymbolFlags), nullptr); -} - -Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>> -getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) { - auto Obj = object::ObjectFile::createObjectFile(ObjBuffer); - - if (!Obj) - return Obj.takeError(); - - if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj->get())) - return getMachOObjectFileSymbolInfo(ES, *MachOObj); - else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get())) - return getELFObjectFileSymbolInfo(ES, *ELFObj); - - return getGenericObjectFileSymbolInfo(ES, **Obj); -} - } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp new file mode 100644 index 000000000000..c1ad569dd65d --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp @@ -0,0 +1,205 @@ +//===------ ObjectFileInterface.cpp - MU interface utils for objects ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" +#include "llvm/ExecutionEngine/Orc/ELFNixPlatform.h" +#include "llvm/ExecutionEngine/Orc/MachOPlatform.h" +#include "llvm/Object/ELFObjectFile.h" +#include "llvm/Object/MachO.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "orc" + +namespace llvm { +namespace orc { + +void addInitSymbol(MaterializationUnit::Interface &I, ExecutionSession &ES, + StringRef ObjFileName) { + assert(!I.InitSymbol && "I already has an init symbol"); + size_t Counter = 0; + + do { + std::string InitSymString; + raw_string_ostream(InitSymString) + << "$." << ObjFileName << ".__inits." << Counter++; + I.InitSymbol = ES.intern(InitSymString); + } while (I.SymbolFlags.count(I.InitSymbol)); + + I.SymbolFlags[I.InitSymbol] = JITSymbolFlags::MaterializationSideEffectsOnly; +} + +static Expected<MaterializationUnit::Interface> +getMachOObjectFileSymbolInfo(ExecutionSession &ES, + const object::MachOObjectFile &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + // Strip the 'exported' flag from MachO linker-private symbols. + if (Name->startswith("l")) + *SymFlags &= ~JITSymbolFlags::Exported; + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + for (auto &Sec : Obj.sections()) { + auto SecType = Obj.getSectionType(Sec); + if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + auto SegName = Obj.getSectionFinalSegmentName(Sec.getRawDataRefImpl()); + auto SecName = cantFail(Obj.getSectionName(Sec.getRawDataRefImpl())); + if (MachOPlatform::isInitializerSection(SegName, SecName)) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + } + + return I; +} + +static Expected<MaterializationUnit::Interface> +getELFObjectFileSymbolInfo(ExecutionSession &ES, + const object::ELFObjectFileBase &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + // ELF STB_GNU_UNIQUE should map to Weak for ORC. + if (Sym.getBinding() == ELF::STB_GNU_UNIQUE) + *SymFlags |= JITSymbolFlags::Weak; + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + SymbolStringPtr InitSymbol; + for (auto &Sec : Obj.sections()) { + if (auto SecName = Sec.getName()) { + if (ELFNixPlatform::isInitializerSection(*SecName)) { + addInitSymbol(I, ES, Obj.getFileName()); + break; + } + } + } + + return I; +} + +Expected<MaterializationUnit::Interface> +getGenericObjectFileSymbolInfo(ExecutionSession &ES, + const object::ObjectFile &Obj) { + MaterializationUnit::Interface I; + + for (auto &Sym : Obj.symbols()) { + Expected<uint32_t> SymFlagsOrErr = Sym.getFlags(); + if (!SymFlagsOrErr) + // TODO: Test this error. + return SymFlagsOrErr.takeError(); + + // Skip symbols not defined in this object file. + if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined) + continue; + + // Skip symbols that are not global. + if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) + continue; + + // Skip symbols that have type SF_File. 
+ if (auto SymType = Sym.getType()) { + if (*SymType == object::SymbolRef::ST_File) + continue; + } else + return SymType.takeError(); + + auto Name = Sym.getName(); + if (!Name) + return Name.takeError(); + auto InternedName = ES.intern(*Name); + auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym); + if (!SymFlags) + return SymFlags.takeError(); + + I.SymbolFlags[InternedName] = std::move(*SymFlags); + } + + return I; +} + +Expected<MaterializationUnit::Interface> +getObjectFileInterface(ExecutionSession &ES, MemoryBufferRef ObjBuffer) { + auto Obj = object::ObjectFile::createObjectFile(ObjBuffer); + + if (!Obj) + return Obj.takeError(); + + if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(Obj->get())) + return getMachOObjectFileSymbolInfo(ES, *MachOObj); + else if (auto *ELFObj = dyn_cast<object::ELFObjectFileBase>(Obj->get())) + return getELFObjectFileSymbolInfo(ES, *ELFObj); + + return getGenericObjectFileSymbolInfo(ES, **Obj); +} + +} // End namespace orc. +} // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 6f840a079dd1..0d6a33c5685e 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -23,12 +23,6 @@ using namespace llvm::orc; namespace { class LinkGraphMaterializationUnit : public MaterializationUnit { -private: - struct LinkGraphInterface { - SymbolFlagsMap SymbolFlags; - SymbolStringPtr InitSymbol; - }; - public: static std::unique_ptr<LinkGraphMaterializationUnit> Create(ObjectLinkingLayer &ObjLinkingLayer, std::unique_ptr<LinkGraph> G) { @@ -44,9 +38,9 @@ public: } private: - static LinkGraphInterface scanLinkGraph(ExecutionSession &ES, LinkGraph &G) { + static Interface scanLinkGraph(ExecutionSession &ES, LinkGraph &G) { - LinkGraphInterface LGI; + Interface LGI; for (auto *Sym : G.defined_symbols()) { // Skip local symbols. 
@@ -98,11 +92,9 @@ private: } LinkGraphMaterializationUnit(ObjectLinkingLayer &ObjLinkingLayer, - std::unique_ptr<LinkGraph> G, - LinkGraphInterface LGI) - : MaterializationUnit(std::move(LGI.SymbolFlags), - std::move(LGI.InitSymbol)), - ObjLinkingLayer(ObjLinkingLayer), G(std::move(G)) {} + std::unique_ptr<LinkGraph> G, Interface LGI) + : MaterializationUnit(std::move(LGI)), ObjLinkingLayer(ObjLinkingLayer), + G(std::move(G)) {} void discard(const JITDylib &JD, const SymbolStringPtr &Name) override { for (auto *Sym : G->defined_symbols()) @@ -257,7 +249,8 @@ public: { - // Check that InternedResult matches up with MR->getSymbols(). + // Check that InternedResult matches up with MR->getSymbols(), overriding + // flags if requested. // This guards against faulty transformations / compilers / object caches. // First check that there aren't any missing symbols. @@ -266,16 +259,20 @@ public: SymbolNameVector MissingSymbols; for (auto &KV : MR->getSymbols()) { + auto I = InternedResult.find(KV.first); + // If this is a materialization-side-effects only symbol then bump // the counter and make sure it's *not* defined, otherwise make // sure that it is defined. if (KV.second.hasMaterializationSideEffectsOnly()) { ++NumMaterializationSideEffectsOnlySymbols; - if (InternedResult.count(KV.first)) + if (I != InternedResult.end()) ExtraSymbols.push_back(KV.first); continue; - } else if (!InternedResult.count(KV.first)) + } else if (I == InternedResult.end()) MissingSymbols.push_back(KV.first); + else if (Layer.OverrideObjectFlags) + I->second.setFlags(KV.second); } // If there were missing symbols then report the error. 
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 673f7394450f..77a8f5af8ba0 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -192,8 +192,8 @@ public: LLVMOrcMaterializationUnitMaterializeFunction Materialize, LLVMOrcMaterializationUnitDiscardFunction Discard, LLVMOrcMaterializationUnitDestroyFunction Destroy) - : llvm::orc::MaterializationUnit(std::move(InitialSymbolFlags), - std::move(InitSymbol)), + : llvm::orc::MaterializationUnit( + Interface(std::move(InitialSymbolFlags), std::move(InitSymbol))), Name(std::move(Name)), Ctx(Ctx), Materialize(Materialize), Discard(Discard), Destroy(Destroy) {} diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index f16c6bdbfa4f..3f38d26869d4 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -124,8 +124,10 @@ void RuntimeDyldImpl::resolveRelocations() { std::lock_guard<sys::Mutex> locked(lock); // Print out the sections prior to relocation. - LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i) - dumpSectionMemory(Sections[i], "before relocations");); + LLVM_DEBUG({ + for (SectionEntry &S : Sections) + dumpSectionMemory(S, "before relocations"); + }); // First, resolve relocations associated with external symbols. if (auto Err = resolveExternalSymbols()) { @@ -136,21 +138,23 @@ void RuntimeDyldImpl::resolveRelocations() { resolveLocalRelocations(); // Print out sections after relocation. 
- LLVM_DEBUG(for (int i = 0, e = Sections.size(); i != e; ++i) - dumpSectionMemory(Sections[i], "after relocations");); + LLVM_DEBUG({ + for (SectionEntry &S : Sections) + dumpSectionMemory(S, "after relocations"); + }); } void RuntimeDyldImpl::resolveLocalRelocations() { // Iterate over all outstanding relocations - for (auto it = Relocations.begin(), e = Relocations.end(); it != e; ++it) { + for (const auto &Rel : Relocations) { // The Section here (Sections[i]) refers to the section in which the // symbol for the relocation is located. The SectionID in the relocation // entry provides the section to which the relocation will be applied. - unsigned Idx = it->first; + unsigned Idx = Rel.first; uint64_t Addr = getSectionLoadAddress(Idx); LLVM_DEBUG(dbgs() << "Resolving relocations Section #" << Idx << "\t" << format("%p", (uintptr_t)Addr) << "\n"); - resolveRelocationList(it->second, Addr); + resolveRelocationList(Rel.second, Addr); } Relocations.clear(); } @@ -457,9 +461,9 @@ static uint64_t computeAllocationSizeForSections(std::vector<uint64_t> &SectionSizes, uint64_t Alignment) { uint64_t TotalSize = 0; - for (size_t Idx = 0, Cnt = SectionSizes.size(); Idx < Cnt; Idx++) { + for (uint64_t SectionSize : SectionSizes) { uint64_t AlignedSize = - (SectionSizes[Idx] + Alignment - 1) / Alignment * Alignment; + (SectionSize + Alignment - 1) / Alignment * Alignment; TotalSize += AlignedSize; } return TotalSize; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 18f1a2314853..5157d51fd18c 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -996,7 +996,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections( Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator()); AllocaIP = Builder.saveIP(); InsertPointTy AfterIP = - applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, true); + applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait); 
BasicBlock *LoopAfterBB = AfterIP.getBlock(); Instruction *SplitPos = LoopAfterBB->getTerminator(); if (!isa_and_nonnull<BranchInst>(SplitPos)) @@ -1156,7 +1156,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( Builder.SetInsertPoint(NonAtomicRedBlock); for (auto En : enumerate(ReductionInfos)) { const ReductionInfo &RI = En.value(); - Type *ValueType = RI.getElementType(); + Type *ValueType = RI.ElementType; Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable, "red.value." + Twine(En.index())); Value *PrivateRedValue = @@ -1181,8 +1181,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( Builder.SetInsertPoint(AtomicRedBlock); if (CanGenerateAtomic) { for (const ReductionInfo &RI : ReductionInfos) { - Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.Variable, - RI.PrivateVariable)); + Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType, + RI.Variable, RI.PrivateVariable)); if (!Builder.GetInsertBlock()) return InsertPointTy(); } @@ -1207,13 +1207,13 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions( RedArrayTy, LHSArrayPtr, 0, En.index()); Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr); Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType()); - Value *LHS = Builder.CreateLoad(RI.getElementType(), LHSPtr); + Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr); Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64( RedArrayTy, RHSArrayPtr, 0, En.index()); Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr); Value *RHSPtr = Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType()); - Value *RHS = Builder.CreateLoad(RI.getElementType(), RHSPtr); + Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr); Value *Reduced; Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced)); if (!Builder.GetInsertBlock()) @@ -1329,13 +1329,10 @@ CanonicalLoopInfo 
*OpenMPIRBuilder::createLoopSkeleton( LoopInfos.emplace_front(); CanonicalLoopInfo *CL = &LoopInfos.front(); - CL->Preheader = Preheader; CL->Header = Header; CL->Cond = Cond; - CL->Body = Body; CL->Latch = Latch; CL->Exit = Exit; - CL->After = After; #ifndef NDEBUG CL->assertOK(); @@ -1359,7 +1356,7 @@ OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc, // Split the loop at the insertion point: Branch to the preheader and move // every following instruction to after the loop (the After BB). Also, the // new successor is the loop's after block. - Builder.CreateBr(CL->Preheader); + Builder.CreateBr(CL->getPreheader()); After->getInstList().splice(After->begin(), BB->getInstList(), Builder.GetInsertPoint(), BB->end()); After->replaceSuccessorsPhiUsesWith(BB, After); @@ -1791,6 +1788,12 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, BasicBlock *OrigAfter = Outermost->getAfter(); Function *F = OrigPreheader->getParent(); + // Loop control blocks that may become orphaned later. + SmallVector<BasicBlock *, 12> OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + // Setup the IRBuilder for inserting the trip count computation. Builder.SetCurrentDebugLocation(DL); if (ComputeIP.isSet()) @@ -1828,7 +1831,7 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Value *Leftover = Result->getIndVar(); SmallVector<Value *> NewIndVars; - NewIndVars.set_size(NumLoops); + NewIndVars.resize(NumLoops); for (int i = NumLoops - 1; i >= 1; --i) { Value *OrigTripCount = Loops[i]->getTripCount(); @@ -1886,10 +1889,6 @@ OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]); // Remove unused parts of the input loops. 
- SmallVector<BasicBlock *, 12> OldControlBBs; - OldControlBBs.reserve(6 * Loops.size()); - for (CanonicalLoopInfo *Loop : Loops) - Loop->collectControlBlocks(OldControlBBs); removeUnusedBlocksFromParent(OldControlBBs); for (CanonicalLoopInfo *L : Loops) @@ -1915,6 +1914,12 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, BasicBlock *InnerEnter = InnermostLoop->getBody(); BasicBlock *InnerLatch = InnermostLoop->getLatch(); + // Loop control blocks that may become orphaned later. + SmallVector<BasicBlock *, 12> OldControlBBs; + OldControlBBs.reserve(6 * Loops.size()); + for (CanonicalLoopInfo *Loop : Loops) + Loop->collectControlBlocks(OldControlBBs); + // Collect original trip counts and induction variable to be accessible by // index. Also, the structure of the original loops is not preserved during // the construction of the tiled loops, so do it before we scavenge the BBs of @@ -2074,10 +2079,6 @@ OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops, } // Remove unused parts of the original loops. 
- SmallVector<BasicBlock *, 12> OldControlBBs; - OldControlBBs.reserve(6 * Loops.size()); - for (CanonicalLoopInfo *Loop : Loops) - Loop->collectControlBlocks(OldControlBBs); removeUnusedBlocksFromParent(OldControlBBs); for (CanonicalLoopInfo *L : Loops) @@ -3079,7 +3080,7 @@ OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, - AtomicUpdateCallbackTy &UpdateOp, bool IsXLHSInRHSPart) { + AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) { if (!updateToLocation(Loc)) return Loc.IP; @@ -3097,7 +3098,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate( }); emitAtomicUpdate(AllocIP, X.Var, Expr, AO, RMWOp, UpdateOp, X.IsVolatile, - IsXLHSInRHSPart); + IsXBinopExpr); checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update); return Builder.saveIP(); } @@ -3134,13 +3135,13 @@ std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(Instruction *AllocIP, Value *X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool VolatileX, bool IsXLHSInRHSPart) { + bool VolatileX, bool IsXBinopExpr) { Type *XElemTy = X->getType()->getPointerElementType(); bool DoCmpExch = ((RMWOp == AtomicRMWInst::BAD_BINOP) || (RMWOp == AtomicRMWInst::FAdd)) || (RMWOp == AtomicRMWInst::FSub) || - (RMWOp == AtomicRMWInst::Sub && !IsXLHSInRHSPart); + (RMWOp == AtomicRMWInst::Sub && !IsXBinopExpr); std::pair<Value *, Value *> Res; if (XElemTy->isIntegerTy() && !DoCmpExch) { @@ -3232,7 +3233,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( const LocationDescription &Loc, Instruction *AllocIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, - bool UpdateExpr, bool IsPostfixUpdate, bool IsXLHSInRHSPart) { + bool 
UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) { if (!updateToLocation(Loc)) return Loc.IP; @@ -3251,9 +3252,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture( // If UpdateExpr is 'x' updated with some `expr` not based on 'x', // 'x' is simply atomically rewritten with 'expr'. AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg); - std::pair<Value *, Value *> Result = - emitAtomicUpdate(AllocIP, X.Var, Expr, AO, AtomicOp, UpdateOp, - X.IsVolatile, IsXLHSInRHSPart); + std::pair<Value *, Value *> Result = emitAtomicUpdate( + AllocIP, X.Var, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile, IsXBinopExpr); Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second); Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile); @@ -3321,7 +3321,16 @@ void CanonicalLoopInfo::collectControlBlocks( // flow. For consistency, this also means we do not add the Body block, which // is just the entry to the body code. BBs.reserve(BBs.size() + 6); - BBs.append({Preheader, Header, Cond, Latch, Exit, After}); + BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()}); +} + +BasicBlock *CanonicalLoopInfo::getPreheader() const { + assert(isValid() && "Requires a valid canonical loop"); + for (BasicBlock *Pred : predecessors(Header)) { + if (Pred != Latch) + return Pred; + } + llvm_unreachable("Missing preheader"); } void CanonicalLoopInfo::assertOK() const { @@ -3330,6 +3339,10 @@ void CanonicalLoopInfo::assertOK() const { if (!isValid()) return; + BasicBlock *Preheader = getPreheader(); + BasicBlock *Body = getBody(); + BasicBlock *After = getAfter(); + // Verify standard control-flow we use for OpenMP loops. 
assert(Preheader); assert(isa<BranchInst>(Preheader->getTerminator()) && @@ -3415,11 +3428,8 @@ void CanonicalLoopInfo::assertOK() const { } void CanonicalLoopInfo::invalidate() { - Preheader = nullptr; Header = nullptr; Cond = nullptr; - Body = nullptr; Latch = nullptr; Exit = nullptr; - After = nullptr; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index c9748e1387eb..bbe0c97e60a2 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -512,10 +512,8 @@ void TypePrinting::incorporateTypes() { // the unnamed ones out to a numbering and remove the anonymous structs. unsigned NextNumber = 0; - std::vector<StructType*>::iterator NextToUse = NamedTypes.begin(), I, E; - for (I = NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I) { - StructType *STy = *I; - + std::vector<StructType *>::iterator NextToUse = NamedTypes.begin(); + for (StructType *STy : NamedTypes) { // Ignore anonymous types. if (STy->isLiteral()) continue; @@ -1450,6 +1448,12 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, return; } + if (const auto *NC = dyn_cast<NoCFIValue>(CV)) { + Out << "no_cfi "; + WriteAsOperandInternal(Out, NC->getGlobalValue(), WriterCtx); + return; + } + if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) { Type *ETy = CA->getType()->getElementType(); Out << '['; @@ -1583,11 +1587,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV, Out << ", "; } - if (CE->hasIndices()) { - ArrayRef<unsigned> Indices = CE->getIndices(); - for (unsigned i = 0, e = Indices.size(); i != e; ++i) - Out << ", " << Indices[i]; - } + if (CE->hasIndices()) + for (unsigned I : CE->getIndices()) + Out << ", " << I; if (CE->isCast()) { Out << " to "; @@ -3528,8 +3530,8 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { } maybePrintComdat(Out, *GV); - if (GV->getAlignment()) - Out << ", align " << GV->getAlignment(); + if (MaybeAlign A = GV->getAlign()) + Out << ", align " << A->value(); 
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs; GV->getAllMetadata(MDs); @@ -3637,13 +3639,13 @@ void AssemblyWriter::printTypeIdentities() { } auto &NamedTypes = TypePrinter.getNamedTypes(); - for (unsigned I = 0, E = NamedTypes.size(); I != E; ++I) { - PrintLLVMName(Out, NamedTypes[I]->getName(), LocalPrefix); + for (StructType *NamedType : NamedTypes) { + PrintLLVMName(Out, NamedType->getName(), LocalPrefix); Out << " = type "; // Make sure we print out at least one level of the type structure, so // that we do not get %FILE = type %FILE - TypePrinter.printStructBody(NamedTypes[I], Out); + TypePrinter.printStructBody(NamedType, Out); Out << '\n'; } } @@ -3757,8 +3759,8 @@ void AssemblyWriter::printFunction(const Function *F) { Out << '"'; } maybePrintComdat(Out, *F); - if (F->getAlignment()) - Out << " align " << F->getAlignment(); + if (MaybeAlign A = F->getAlign()) + Out << " align " << A->value(); if (F->hasGC()) Out << " gc \"" << F->getGC() << '"'; if (F->hasPrefixData()) { @@ -4239,8 +4241,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) { Out << ", "; writeOperand(AI->getArraySize(), true); } - if (AI->getAlignment()) { - Out << ", align " << AI->getAlignment(); + if (MaybeAlign A = AI->getAlign()) { + Out << ", align " << A->value(); } unsigned AddrSpace = AI->getType()->getAddressSpace(); @@ -4310,13 +4312,13 @@ void AssemblyWriter::printInstruction(const Instruction &I) { if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isAtomic()) writeAtomic(LI->getContext(), LI->getOrdering(), LI->getSyncScopeID()); - if (LI->getAlignment()) - Out << ", align " << LI->getAlignment(); + if (MaybeAlign A = LI->getAlign()) + Out << ", align " << A->value(); } else if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isAtomic()) writeAtomic(SI->getContext(), SI->getOrdering(), SI->getSyncScopeID()); - if (SI->getAlignment()) - Out << ", align " << SI->getAlignment(); + if (MaybeAlign A = SI->getAlign()) + Out << ", align " << 
A->value(); } else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&I)) { writeAtomicCmpXchg(CXI->getContext(), CXI->getSuccessOrdering(), CXI->getFailureOrdering(), CXI->getSyncScopeID()); diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h index c5bbe6571096..1153fb827b56 100644 --- a/llvm/lib/IR/AttributeImpl.h +++ b/llvm/lib/IR/AttributeImpl.h @@ -253,7 +253,8 @@ public: uint64_t getDereferenceableBytes() const; uint64_t getDereferenceableOrNullBytes() const; std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const; - std::pair<unsigned, unsigned> getVScaleRangeArgs() const; + unsigned getVScaleRangeMin() const; + Optional<unsigned> getVScaleRangeMax() const; std::string getAsString(bool InAttrGrp) const; Type *getAttributeType(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index f81a446d6e46..c899afae6cce 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -78,15 +78,18 @@ unpackAllocSizeArgs(uint64_t Num) { return std::make_pair(ElemSizeArg, NumElemsArg); } -static uint64_t packVScaleRangeArgs(unsigned MinValue, unsigned MaxValue) { - return uint64_t(MinValue) << 32 | MaxValue; +static uint64_t packVScaleRangeArgs(unsigned MinValue, + Optional<unsigned> MaxValue) { + return uint64_t(MinValue) << 32 | MaxValue.getValueOr(0); } -static std::pair<unsigned, unsigned> unpackVScaleRangeArgs(uint64_t Value) { +static std::pair<unsigned, Optional<unsigned>> +unpackVScaleRangeArgs(uint64_t Value) { unsigned MaxValue = Value & std::numeric_limits<unsigned>::max(); unsigned MinValue = Value >> 32; - return std::make_pair(MinValue, MaxValue); + return std::make_pair(MinValue, + MaxValue > 0 ? 
MaxValue : Optional<unsigned>()); } Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, @@ -354,10 +357,16 @@ std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const { return unpackAllocSizeArgs(pImpl->getValueAsInt()); } -std::pair<unsigned, unsigned> Attribute::getVScaleRangeArgs() const { +unsigned Attribute::getVScaleRangeMin() const { + assert(hasAttribute(Attribute::VScaleRange) && + "Trying to get vscale args from non-vscale attribute"); + return unpackVScaleRangeArgs(pImpl->getValueAsInt()).first; +} + +Optional<unsigned> Attribute::getVScaleRangeMax() const { assert(hasAttribute(Attribute::VScaleRange) && "Trying to get vscale args from non-vscale attribute"); - return unpackVScaleRangeArgs(pImpl->getValueAsInt()); + return unpackVScaleRangeArgs(pImpl->getValueAsInt()).second; } std::string Attribute::getAsString(bool InAttrGrp) const { @@ -428,13 +437,13 @@ std::string Attribute::getAsString(bool InAttrGrp) const { } if (hasAttribute(Attribute::VScaleRange)) { - unsigned MinValue, MaxValue; - std::tie(MinValue, MaxValue) = getVScaleRangeArgs(); + unsigned MinValue = getVScaleRangeMin(); + Optional<unsigned> MaxValue = getVScaleRangeMax(); std::string Result = "vscale_range("; Result += utostr(MinValue); Result += ','; - Result += utostr(MaxValue); + Result += utostr(MaxValue.getValueOr(0)); Result += ')'; return Result; } @@ -717,9 +726,12 @@ std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const { : std::pair<unsigned, Optional<unsigned>>(0, 0); } -std::pair<unsigned, unsigned> AttributeSet::getVScaleRangeArgs() const { - return SetNode ? SetNode->getVScaleRangeArgs() - : std::pair<unsigned, unsigned>(0, 0); +unsigned AttributeSet::getVScaleRangeMin() const { + return SetNode ? SetNode->getVScaleRangeMin() : 1; +} + +Optional<unsigned> AttributeSet::getVScaleRangeMax() const { + return SetNode ? 
SetNode->getVScaleRangeMax() : None; } std::string AttributeSet::getAsString(bool InAttrGrp) const { @@ -897,10 +909,16 @@ AttributeSetNode::getAllocSizeArgs() const { return std::make_pair(0, 0); } -std::pair<unsigned, unsigned> AttributeSetNode::getVScaleRangeArgs() const { +unsigned AttributeSetNode::getVScaleRangeMin() const { if (auto A = findEnumAttribute(Attribute::VScaleRange)) - return A->getVScaleRangeArgs(); - return std::make_pair(0, 0); + return A->getVScaleRangeMin(); + return 1; +} + +Optional<unsigned> AttributeSetNode::getVScaleRangeMax() const { + if (auto A = findEnumAttribute(Attribute::VScaleRange)) + return A->getVScaleRangeMax(); + return None; } std::string AttributeSetNode::getAsString(bool InAttrGrp) const { @@ -1118,16 +1136,21 @@ AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs, } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, - const AttrBuilder &B) { - if (!B.hasAttributes()) + AttributeSet Attrs) { + if (!Attrs.hasAttributes()) return {}; Index = attrIdxToArrayIdx(Index); SmallVector<AttributeSet, 8> AttrSets(Index + 1); - AttrSets[Index] = AttributeSet::get(C, B); + AttrSets[Index] = Attrs; return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, + const AttrBuilder &B) { + return get(C, Index, AttributeSet::get(C, B)); +} + +AttributeList AttributeList::get(LLVMContext &C, unsigned Index, ArrayRef<Attribute::AttrKind> Kinds) { SmallVector<std::pair<unsigned, Attribute>, 8> Attrs; for (const auto K : Kinds) @@ -1623,8 +1646,12 @@ std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const { return unpackAllocSizeArgs(getRawIntAttr(Attribute::AllocSize)); } -std::pair<unsigned, unsigned> AttrBuilder::getVScaleRangeArgs() const { - return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)); +unsigned AttrBuilder::getVScaleRangeMin() const { + return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)).first; +} + 
+Optional<unsigned> AttrBuilder::getVScaleRangeMax() const { + return unpackVScaleRangeArgs(getRawIntAttr(Attribute::VScaleRange)).second; } AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) { @@ -1669,7 +1696,7 @@ AttrBuilder &AttrBuilder::addAllocSizeAttrFromRawRepr(uint64_t RawArgs) { } AttrBuilder &AttrBuilder::addVScaleRangeAttr(unsigned MinValue, - unsigned MaxValue) { + Optional<unsigned> MaxValue) { return addVScaleRangeAttrFromRawRepr(packVScaleRangeArgs(MinValue, MaxValue)); } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d73d1e9c20b3..b8ad2b294b87 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -702,6 +702,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), IID, Tys); return true; } + + if (Name == "arm.mve.vctp64" && + cast<FixedVectorType>(F->getReturnType())->getNumElements() == 4) { + // A vctp64 returning a v4i1 is converted to return a v2i1. Rename the + // function and deal with it below in UpgradeIntrinsicCall. + rename(F); + return true; + } + // These too are changed to accept a v2i1 instead of the old v4i1. 
+ if (Name == "arm.mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "arm.cde.vcx1q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3q.predicated.v2i64.v4i1" || + Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1") + return true; + break; } @@ -1803,6 +1828,96 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } +static Value *UpgradeARMIntrinsicCall(StringRef Name, CallInst *CI, Function *F, + IRBuilder<> &Builder) { + if (Name == "mve.vctp64.old") { + // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the + // correct type. 
+ Value *VCTP = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 2, false)}), + VCTP); + return Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + C1); + } else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" || + Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1" || + Name == "mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1" || + Name == "cde.vcx1q.predicated.v2i64.v4i1" || + Name == "cde.vcx1qa.predicated.v2i64.v4i1" || + Name == "cde.vcx2q.predicated.v2i64.v4i1" || + Name == "cde.vcx2qa.predicated.v2i64.v4i1" || + Name == "cde.vcx3q.predicated.v2i64.v4i1" || + Name == "cde.vcx3qa.predicated.v2i64.v4i1") { + std::vector<Type *> Tys; + unsigned ID = CI->getIntrinsicID(); + Type *V2I1Ty = FixedVectorType::get(Builder.getInt1Ty(), 2); + switch (ID) { + case Intrinsic::arm_mve_mull_int_predicated: + case Intrinsic::arm_mve_vqdmull_predicated: + case Intrinsic::arm_mve_vldr_gather_base_predicated: + Tys = {CI->getType(), CI->getOperand(0)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: + case Intrinsic::arm_mve_vstr_scatter_base_predicated: + case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: + Tys = {CI->getOperand(0)->getType(), CI->getOperand(0)->getType(), + V2I1Ty}; + break; + case Intrinsic::arm_mve_vldr_gather_offset_predicated: + Tys = 
{CI->getType(), CI->getOperand(0)->getType(), + CI->getOperand(1)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_mve_vstr_scatter_offset_predicated: + Tys = {CI->getOperand(0)->getType(), CI->getOperand(1)->getType(), + CI->getOperand(2)->getType(), V2I1Ty}; + break; + case Intrinsic::arm_cde_vcx1q_predicated: + case Intrinsic::arm_cde_vcx1qa_predicated: + case Intrinsic::arm_cde_vcx2q_predicated: + case Intrinsic::arm_cde_vcx2qa_predicated: + case Intrinsic::arm_cde_vcx3q_predicated: + case Intrinsic::arm_cde_vcx3qa_predicated: + Tys = {CI->getOperand(1)->getType(), V2I1Ty}; + break; + default: + llvm_unreachable("Unhandled Intrinsic!"); + } + + std::vector<Value *> Ops; + for (Value *Op : CI->args()) { + Type *Ty = Op->getType(); + if (Ty->getScalarSizeInBits() == 1) { + Value *C1 = Builder.CreateCall( + Intrinsic::getDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 4, false)}), + Op); + Op = Builder.CreateCall( + Intrinsic::getDeclaration(F->getParent(), + Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + C1); + } + Ops.push_back(Op); + } + + Function *Fn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + return Builder.CreateCall(Fn, Ops, CI->getName()); + } + llvm_unreachable("Unknown function for ARM CallInst upgrade."); +} + /// Upgrade a call to an old intrinsic. All argument and return casting must be /// provided to seamlessly integrate with existing context. 
void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { @@ -1826,6 +1941,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { bool IsNVVM = Name.startswith("nvvm."); if (IsNVVM) Name = Name.substr(5); + bool IsARM = Name.startswith("arm."); + if (IsARM) + Name = Name.substr(4); if (IsX86 && Name.startswith("sse4a.movnt.")) { Module *M = F->getParent(); @@ -2289,14 +2407,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { if (CI->arg_size() >= 3) Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); - } else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) { - Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0), - CI->getArgOperand(1), CI->getArgOperand(2), - /*Aligned*/false); - } else if (IsX86 && (Name.startswith("avx512.mask.load."))) { - Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0), - CI->getArgOperand(1),CI->getArgOperand(2), - /*Aligned*/true); + } else if (IsX86 && Name.startswith("avx512.mask.load")) { + // "avx512.mask.loadu." or "avx512.mask.load." + bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". 
+ Rep = + UpgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), Aligned); } else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) { auto *ResultTy = cast<FixedVectorType>(CI->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -3649,6 +3765,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { F->getParent(), Intrinsic::convert_from_fp16, {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); + } else if (IsARM) { + Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder); } else { llvm_unreachable("Unknown function for CallInst upgrade."); } diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index ed1956e0f7e9..7beafc485d09 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -450,8 +450,8 @@ BasicBlock *BasicBlock::splitBasicBlockBefore(iterator I, const Twine &BBName) { void BasicBlock::replacePhiUsesWith(BasicBlock *Old, BasicBlock *New) { // N.B. This might not be a complete BasicBlock, so don't assume // that it ends with a non-phi instruction. 
- for (iterator II = begin(), IE = end(); II != IE; ++II) { - PHINode *PN = dyn_cast<PHINode>(II); + for (Instruction &I : *this) { + PHINode *PN = dyn_cast<PHINode>(&I); if (!PN) break; PN->replaceIncomingBlockWith(Old, New); diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp index 437fd0558447..8668fe82601c 100644 --- a/llvm/lib/IR/ConstantFold.cpp +++ b/llvm/lib/IR/ConstantFold.cpp @@ -1801,46 +1801,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred, } else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) { const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF(); const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF(); - APFloat::cmpResult R = C1V.compare(C2V); - switch (pred) { - default: llvm_unreachable("Invalid FCmp Predicate"); - case FCmpInst::FCMP_FALSE: return Constant::getNullValue(ResultTy); - case FCmpInst::FCMP_TRUE: return Constant::getAllOnesValue(ResultTy); - case FCmpInst::FCMP_UNO: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered); - case FCmpInst::FCMP_ORD: - return ConstantInt::get(ResultTy, R!=APFloat::cmpUnordered); - case FCmpInst::FCMP_UEQ: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpEqual); - case FCmpInst::FCMP_OEQ: - return ConstantInt::get(ResultTy, R==APFloat::cmpEqual); - case FCmpInst::FCMP_UNE: - return ConstantInt::get(ResultTy, R!=APFloat::cmpEqual); - case FCmpInst::FCMP_ONE: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan || - R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_ULT: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpLessThan); - case FCmpInst::FCMP_OLT: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan); - case FCmpInst::FCMP_UGT: - return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered || - R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_OGT: - return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan); - case FCmpInst::FCMP_ULE: - return 
ConstantInt::get(ResultTy, R!=APFloat::cmpGreaterThan); - case FCmpInst::FCMP_OLE: - return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan || - R==APFloat::cmpEqual); - case FCmpInst::FCMP_UGE: - return ConstantInt::get(ResultTy, R!=APFloat::cmpLessThan); - case FCmpInst::FCMP_OGE: - return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan || - R==APFloat::cmpEqual); - } + CmpInst::Predicate Predicate = CmpInst::Predicate(pred); + return ConstantInt::get(ResultTy, FCmpInst::compare(C1V, C2V, Predicate)); } else if (auto *C1VTy = dyn_cast<VectorType>(C1->getType())) { // Fast path for splatted constants. @@ -2215,9 +2177,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, if (C->isNullValue()) { bool isNull = true; - for (unsigned i = 0, e = Idxs.size(); i != e; ++i) - if (!isa<UndefValue>(Idxs[i]) && - !cast<Constant>(Idxs[i])->isNullValue()) { + for (Value *Idx : Idxs) + if (!isa<UndefValue>(Idx) && !cast<Constant>(Idx)->isNullValue()) { isNull = false; break; } @@ -2233,8 +2194,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, // The GEP returns a vector of pointers when one of more of // its arguments is a vector. 
- for (unsigned i = 0, e = Idxs.size(); i != e; ++i) { - if (auto *VT = dyn_cast<VectorType>(Idxs[i]->getType())) { + for (Value *Idx : Idxs) { + if (auto *VT = dyn_cast<VectorType>(Idx->getType())) { assert((!isa<VectorType>(GEPTy) || isa<ScalableVectorType>(GEPTy) == isa<ScalableVectorType>(VT)) && "Mismatched GEPTy vector types"); diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index c66cfb6e9ac1..837be910f6d8 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -535,6 +535,9 @@ void llvm::deleteConstant(Constant *C) { case Constant::DSOLocalEquivalentVal: delete static_cast<DSOLocalEquivalent *>(C); break; + case Constant::NoCFIValueVal: + delete static_cast<NoCFIValue *>(C); + break; case Constant::UndefValueVal: delete static_cast<UndefValue *>(C); break; @@ -1296,9 +1299,10 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) { if (V.empty()) return ConstantAggregateZero::get(Ty); - for (unsigned i = 0, e = V.size(); i != e; ++i) { - assert(V[i]->getType() == Ty->getElementType() && + for (Constant *C : V) { + assert(C->getType() == Ty->getElementType() && "Wrong type in array element initializer"); + (void)C; } // If this is an all-zero array, return a ConstantAggregateZero object. If @@ -1364,12 +1368,12 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) { isZero = V[0]->isNullValue(); // PoisonValue inherits UndefValue, so its check is not necessary. 
if (isUndef || isZero) { - for (unsigned i = 0, e = V.size(); i != e; ++i) { - if (!V[i]->isNullValue()) + for (Constant *C : V) { + if (!C->isNullValue()) isZero = false; - if (!isa<PoisonValue>(V[i])) + if (!isa<PoisonValue>(C)) isPoison = false; - if (isa<PoisonValue>(V[i]) || !isa<UndefValue>(V[i])) + if (isa<PoisonValue>(C) || !isa<UndefValue>(C)) isUndef = false; } } @@ -1962,6 +1966,47 @@ Value *DSOLocalEquivalent::handleOperandChangeImpl(Value *From, Value *To) { return nullptr; } +NoCFIValue *NoCFIValue::get(GlobalValue *GV) { + NoCFIValue *&NC = GV->getContext().pImpl->NoCFIValues[GV]; + if (!NC) + NC = new NoCFIValue(GV); + + assert(NC->getGlobalValue() == GV && + "NoCFIValue does not match the expected global value"); + return NC; +} + +NoCFIValue::NoCFIValue(GlobalValue *GV) + : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) { + setOperand(0, GV); +} + +/// Remove the constant from the constant table. +void NoCFIValue::destroyConstantImpl() { + const GlobalValue *GV = getGlobalValue(); + GV->getContext().pImpl->NoCFIValues.erase(GV); +} + +Value *NoCFIValue::handleOperandChangeImpl(Value *From, Value *To) { + assert(From == getGlobalValue() && "Changing value does not match operand."); + + GlobalValue *GV = dyn_cast<GlobalValue>(To->stripPointerCasts()); + assert(GV && "Can only replace the operands with a global value"); + + NoCFIValue *&NewNC = getContext().pImpl->NoCFIValues[GV]; + if (NewNC) + return llvm::ConstantExpr::getBitCast(NewNC, getType()); + + getContext().pImpl->NoCFIValues.erase(getGlobalValue()); + NewNC = this; + setOperand(0, GV); + + if (GV->getType() != getType()) + mutateType(GV->getType()); + + return nullptr; +} + //---- ConstantExpr::get() implementations. 
// diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 2c396ae97499..a263d2536541 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -1696,6 +1696,14 @@ LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal, return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList)); } +LLVMValueRef LLVMConstGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, unsigned NumIndices) { + ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), + NumIndices); + Constant *Val = unwrap<Constant>(ConstantVal); + return wrap(ConstantExpr::getGetElementPtr(unwrap(Ty), Val, IdxList)); +} + LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, LLVMValueRef *ConstantIndices, unsigned NumIndices) { @@ -1707,6 +1715,15 @@ LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal, return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList)); } +LLVMValueRef LLVMConstInBoundsGEP2(LLVMTypeRef Ty, LLVMValueRef ConstantVal, + LLVMValueRef *ConstantIndices, + unsigned NumIndices) { + ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices), + NumIndices); + Constant *Val = unwrap<Constant>(ConstantVal); + return wrap(ConstantExpr::getInBoundsGetElementPtr(unwrap(Ty), Val, IdxList)); +} + LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) { return wrap(ConstantExpr::getTrunc(unwrap<Constant>(ConstantVal), unwrap(ToType))); @@ -3007,13 +3024,17 @@ LLVMTypeRef LLVMGetAllocatedType(LLVMValueRef Alloca) { /*--.. 
Operations on gep instructions (only) ...............................--*/ LLVMBool LLVMIsInBounds(LLVMValueRef GEP) { - return unwrap<GetElementPtrInst>(GEP)->isInBounds(); + return unwrap<GEPOperator>(GEP)->isInBounds(); } void LLVMSetIsInBounds(LLVMValueRef GEP, LLVMBool InBounds) { return unwrap<GetElementPtrInst>(GEP)->setIsInBounds(InBounds); } +LLVMTypeRef LLVMGetGEPSourceElementType(LLVMValueRef GEP) { + return wrap(unwrap<GEPOperator>(GEP)->getSourceElementType()); +} + /*--.. Operations on phi nodes .............................................--*/ void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues, @@ -3039,7 +3060,7 @@ LLVMBasicBlockRef LLVMGetIncomingBlock(LLVMValueRef PhiNode, unsigned Index) { unsigned LLVMGetNumIndices(LLVMValueRef Inst) { auto *I = unwrap(Inst); - if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (auto *GEP = dyn_cast<GEPOperator>(I)) return GEP->getNumIndices(); if (auto *EV = dyn_cast<ExtractValueInst>(I)) return EV->getNumIndices(); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 548962bd6a98..35af22034a12 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -671,11 +671,11 @@ DIBuilder::getOrCreateMacroArray(ArrayRef<Metadata *> Elements) { DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) { SmallVector<llvm::Metadata *, 16> Elts; - for (unsigned i = 0, e = Elements.size(); i != e; ++i) { - if (Elements[i] && isa<MDNode>(Elements[i])) - Elts.push_back(cast<DIType>(Elements[i])); + for (Metadata *E : Elements) { + if (isa_and_nonnull<MDNode>(E)) + Elts.push_back(cast<DIType>(E)); else - Elts.push_back(Elements[i]); + Elts.push_back(E); } return DITypeRefArray(MDNode::get(VMContext, Elts)); } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 2ace18048262..61b2b13bfd03 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -124,26 +124,25 @@ LayoutAlignElem::operator==(const LayoutAlignElem 
&rhs) const { // PointerAlignElem, PointerAlign support //===----------------------------------------------------------------------===// -PointerAlignElem PointerAlignElem::get(uint32_t AddressSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth) { +PointerAlignElem PointerAlignElem::getInBits(uint32_t AddressSpace, + Align ABIAlign, Align PrefAlign, + uint32_t TypeBitWidth, + uint32_t IndexBitWidth) { assert(ABIAlign <= PrefAlign && "Preferred alignment worse than ABI!"); PointerAlignElem retval; retval.AddressSpace = AddressSpace; retval.ABIAlign = ABIAlign; retval.PrefAlign = PrefAlign; - retval.TypeByteWidth = TypeByteWidth; - retval.IndexWidth = IndexWidth; + retval.TypeBitWidth = TypeBitWidth; + retval.IndexBitWidth = IndexBitWidth; return retval; } bool PointerAlignElem::operator==(const PointerAlignElem &rhs) const { - return (ABIAlign == rhs.ABIAlign - && AddressSpace == rhs.AddressSpace - && PrefAlign == rhs.PrefAlign - && TypeByteWidth == rhs.TypeByteWidth - && IndexWidth == rhs.IndexWidth); + return (ABIAlign == rhs.ABIAlign && AddressSpace == rhs.AddressSpace && + PrefAlign == rhs.PrefAlign && TypeBitWidth == rhs.TypeBitWidth && + IndexBitWidth == rhs.IndexBitWidth); } //===----------------------------------------------------------------------===// @@ -197,7 +196,7 @@ void DataLayout::reset(StringRef Desc) { E.PrefAlign, E.TypeBitWidth)) return report_fatal_error(std::move(Err)); } - if (Error Err = setPointerAlignment(0, Align(8), Align(8), 8, 8)) + if (Error Err = setPointerAlignmentInBits(0, Align(8), Align(8), 64, 64)) return report_fatal_error(std::move(Err)); if (Error Err = parseSpecifier(Desc)) @@ -318,7 +317,7 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (Error Err = ::split(Rest, ':', Split)) return Err; unsigned PointerMemSize; - if (Error Err = getIntInBytes(Tok, PointerMemSize)) + if (Error Err = getInt(Tok, PointerMemSize)) return Err; if (!PointerMemSize) return reportError("Invalid pointer 
size of 0 bytes"); @@ -354,13 +353,13 @@ Error DataLayout::parseSpecifier(StringRef Desc) { if (!Rest.empty()) { if (Error Err = ::split(Rest, ':', Split)) return Err; - if (Error Err = getIntInBytes(Tok, IndexSize)) + if (Error Err = getInt(Tok, IndexSize)) return Err; if (!IndexSize) return reportError("Invalid index size of 0 bytes"); } } - if (Error Err = setPointerAlignment( + if (Error Err = setPointerAlignmentInBits( AddrSpace, assumeAligned(PointerABIAlign), assumeAligned(PointerPrefAlign), PointerMemSize, IndexSize)) return Err; @@ -603,9 +602,10 @@ DataLayout::getPointerAlignElem(uint32_t AddressSpace) const { return Pointers[0]; } -Error DataLayout::setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, - Align PrefAlign, uint32_t TypeByteWidth, - uint32_t IndexWidth) { +Error DataLayout::setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, + Align PrefAlign, + uint32_t TypeBitWidth, + uint32_t IndexBitWidth) { if (PrefAlign < ABIAlign) return reportError( "Preferred alignment cannot be less than the ABI alignment"); @@ -615,13 +615,14 @@ Error DataLayout::setPointerAlignment(uint32_t AddrSpace, Align ABIAlign, return A.AddressSpace < AddressSpace; }); if (I == Pointers.end() || I->AddressSpace != AddrSpace) { - Pointers.insert(I, PointerAlignElem::get(AddrSpace, ABIAlign, PrefAlign, - TypeByteWidth, IndexWidth)); + Pointers.insert(I, + PointerAlignElem::getInBits(AddrSpace, ABIAlign, PrefAlign, + TypeBitWidth, IndexBitWidth)); } else { I->ABIAlign = ABIAlign; I->PrefAlign = PrefAlign; - I->TypeByteWidth = TypeByteWidth; - I->IndexWidth = IndexWidth; + I->TypeBitWidth = TypeBitWidth; + I->IndexBitWidth = IndexBitWidth; } return Error::success(); } @@ -704,13 +705,14 @@ Align DataLayout::getPointerPrefAlignment(unsigned AS) const { } unsigned DataLayout::getPointerSize(unsigned AS) const { - return getPointerAlignElem(AS).TypeByteWidth; + return divideCeil(getPointerAlignElem(AS).TypeBitWidth, 8); } unsigned DataLayout::getMaxIndexSize() const { 
unsigned MaxIndexSize = 0; for (auto &P : Pointers) - MaxIndexSize = std::max(MaxIndexSize, P.IndexWidth); + MaxIndexSize = + std::max(MaxIndexSize, (unsigned)divideCeil(P.IndexBitWidth, 8)); return MaxIndexSize; } @@ -723,7 +725,7 @@ unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const { } unsigned DataLayout::getIndexSize(unsigned AS) const { - return getPointerAlignElem(AS).IndexWidth; + return divideCeil(getPointerAlignElem(AS).IndexBitWidth, 8); } unsigned DataLayout::getIndexTypeSizeInBits(Type *Ty) const { @@ -901,16 +903,14 @@ int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy, return Result; } -static void addElementIndex(SmallVectorImpl<APInt> &Indices, TypeSize ElemSize, - APInt &Offset) { +static APInt getElementIndex(TypeSize ElemSize, APInt &Offset) { // Skip over scalable or zero size elements. Also skip element sizes larger // than the positive index space, because the arithmetic below may not be // correct in that case. unsigned BitWidth = Offset.getBitWidth(); if (ElemSize.isScalable() || ElemSize == 0 || !isUIntN(BitWidth - 1, ElemSize)) { - Indices.push_back(APInt::getZero(BitWidth)); - return; + return APInt::getZero(BitWidth); } APInt Index = Offset.sdiv(ElemSize); @@ -921,47 +921,52 @@ static void addElementIndex(SmallVectorImpl<APInt> &Indices, TypeSize ElemSize, Offset += ElemSize; assert(Offset.isNonNegative() && "Remaining offset shouldn't be negative"); } - Indices.push_back(Index); + return Index; } -SmallVector<APInt> DataLayout::getGEPIndicesForOffset(Type *&ElemTy, - APInt &Offset) const { - assert(ElemTy->isSized() && "Element type must be sized"); - SmallVector<APInt> Indices; - addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset); - while (Offset != 0) { - if (auto *ArrTy = dyn_cast<ArrayType>(ElemTy)) { - ElemTy = ArrTy->getElementType(); - addElementIndex(Indices, getTypeAllocSize(ElemTy), Offset); - continue; - } +Optional<APInt> DataLayout::getGEPIndexForOffset(Type *&ElemTy, + APInt &Offset) const { + if 
(auto *ArrTy = dyn_cast<ArrayType>(ElemTy)) { + ElemTy = ArrTy->getElementType(); + return getElementIndex(getTypeAllocSize(ElemTy), Offset); + } - if (auto *VecTy = dyn_cast<VectorType>(ElemTy)) { - ElemTy = VecTy->getElementType(); - unsigned ElemSizeInBits = getTypeSizeInBits(ElemTy).getFixedSize(); - // GEPs over non-multiple of 8 size vector elements are invalid. - if (ElemSizeInBits % 8 != 0) - break; + if (auto *VecTy = dyn_cast<VectorType>(ElemTy)) { + ElemTy = VecTy->getElementType(); + unsigned ElemSizeInBits = getTypeSizeInBits(ElemTy).getFixedSize(); + // GEPs over non-multiple of 8 size vector elements are invalid. + if (ElemSizeInBits % 8 != 0) + return None; - addElementIndex(Indices, TypeSize::Fixed(ElemSizeInBits / 8), Offset); - continue; - } + return getElementIndex(TypeSize::Fixed(ElemSizeInBits / 8), Offset); + } - if (auto *STy = dyn_cast<StructType>(ElemTy)) { - const StructLayout *SL = getStructLayout(STy); - uint64_t IntOffset = Offset.getZExtValue(); - if (IntOffset >= SL->getSizeInBytes()) - break; + if (auto *STy = dyn_cast<StructType>(ElemTy)) { + const StructLayout *SL = getStructLayout(STy); + uint64_t IntOffset = Offset.getZExtValue(); + if (IntOffset >= SL->getSizeInBytes()) + return None; - unsigned Index = SL->getElementContainingOffset(IntOffset); - Offset -= SL->getElementOffset(Index); - ElemTy = STy->getElementType(Index); - Indices.push_back(APInt(32, Index)); - continue; - } + unsigned Index = SL->getElementContainingOffset(IntOffset); + Offset -= SL->getElementOffset(Index); + ElemTy = STy->getElementType(Index); + return APInt(32, Index); + } + + // Non-aggregate type. + return None; +} - // Can't index into non-aggregate type. 
- break; +SmallVector<APInt> DataLayout::getGEPIndicesForOffset(Type *&ElemTy, + APInt &Offset) const { + assert(ElemTy->isSized() && "Element type must be sized"); + SmallVector<APInt> Indices; + Indices.push_back(getElementIndex(getTypeAllocSize(ElemTy), Offset)); + while (Offset != 0) { + Optional<APInt> Index = getGEPIndexForOffset(ElemTy, Offset); + if (!Index) + break; + Indices.push_back(*Index); } return Indices; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 82b20a8af91b..f1a6402fb11b 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -980,7 +980,10 @@ enum IIT_Info { IIT_STRUCT9 = 49, IIT_V256 = 50, IIT_AMX = 51, - IIT_PPCF128 = 52 + IIT_PPCF128 = 52, + IIT_V3 = 53, + IIT_EXTERNREF = 54, + IIT_FUNCREF = 55 }; static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, @@ -1056,6 +1059,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::getVector(2, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; + case IIT_V3: + OutputTable.push_back(IITDescriptor::getVector(3, IsScalableVector)); + DecodeIITType(NextElt, Infos, Info, OutputTable); + return; case IIT_V4: OutputTable.push_back(IITDescriptor::getVector(4, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); @@ -1092,6 +1099,14 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, OutputTable.push_back(IITDescriptor::getVector(1024, IsScalableVector)); DecodeIITType(NextElt, Infos, Info, OutputTable); return; + case IIT_EXTERNREF: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 10)); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); + return; + case IIT_FUNCREF: + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 20)); + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 8)); + return; case IIT_PTR: 
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0)); DecodeIITType(NextElt, Infos, Info, OutputTable); diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 9f38288095e3..b6bd25aa1234 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -126,7 +126,7 @@ void GlobalObject::setAlignment(MaybeAlign Align) { void GlobalObject::copyAttributesFrom(const GlobalObject *Src) { GlobalValue::copyAttributesFrom(Src); - setAlignment(MaybeAlign(Src->getAlignment())); + setAlignment(Src->getAlign()); setSection(Src->getSection()); } @@ -249,7 +249,7 @@ bool GlobalObject::canIncreaseAlignment() const { // alignment specified. (If it is assigned a section, the global // could be densely packed with other objects in the section, and // increasing the alignment could cause padding issues.) - if (hasSection() && getAlignment() > 0) + if (hasSection() && getAlign().hasValue()) return false; // On ELF platforms, we're further restricted in that we can't diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp index 56932b457225..a0c48781ced5 100644 --- a/llvm/lib/IR/InlineAsm.cpp +++ b/llvm/lib/IR/InlineAsm.cpp @@ -262,12 +262,12 @@ bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) { unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0; unsigned NumIndirect = 0; - for (unsigned i = 0, e = Constraints.size(); i != e; ++i) { - switch (Constraints[i].Type) { + for (const ConstraintInfo &Constraint : Constraints) { + switch (Constraint.Type) { case InlineAsm::isOutput: if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0) return false; // outputs before inputs and clobbers. 
- if (!Constraints[i].isIndirect) { + if (!Constraint.isIndirect) { ++NumOutputs; break; } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index a4659da7e807..4480ec799c35 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -166,7 +166,10 @@ void Instruction::dropPoisonGeneratingFlags() { cast<GetElementPtrInst>(this)->setIsInBounds(false); break; } - // TODO: FastMathFlags! + if (isa<FPMathOperator>(this)) { + setHasNoNaNs(false); + setHasNoInfs(false); + } assert(!hasPoisonGeneratingFlags() && "must be kept in sync"); } @@ -436,17 +439,17 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2, if (const AllocaInst *AI = dyn_cast<AllocaInst>(I1)) return AI->getAllocatedType() == cast<AllocaInst>(I2)->getAllocatedType() && - (AI->getAlignment() == cast<AllocaInst>(I2)->getAlignment() || + (AI->getAlign() == cast<AllocaInst>(I2)->getAlign() || IgnoreAlignment); if (const LoadInst *LI = dyn_cast<LoadInst>(I1)) return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() && - (LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() || + (LI->getAlign() == cast<LoadInst>(I2)->getAlign() || IgnoreAlignment) && LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() && LI->getSyncScopeID() == cast<LoadInst>(I2)->getSyncScopeID(); if (const StoreInst *SI = dyn_cast<StoreInst>(I1)) return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() && - (SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() || + (SI->getAlign() == cast<StoreInst>(I2)->getAlign() || IgnoreAlignment) && SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() && SI->getSyncScopeID() == cast<StoreInst>(I2)->getSyncScopeID(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index ad27a6d8c08e..7798af3b19b9 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1410,8 +1410,6 @@ bool AllocaInst::isStaticAlloca() const { void LoadInst::AssertOK() { 
assert(getOperand(0)->getType()->isPointerTy() && "Ptr must have pointer type."); - assert(!(isAtomic() && getAlignment() == 0) && - "Alignment required for atomic load"); } static Align computeLoadStoreDefaultAlign(Type *Ty, BasicBlock *BB) { @@ -1490,8 +1488,6 @@ void StoreInst::AssertOK() { assert(cast<PointerType>(getOperand(1)->getType()) ->isOpaqueOrPointeeTypeMatches(getOperand(0)->getType()) && "Ptr must be a pointer to Val type!"); - assert(!(isAtomic() && getAlignment() == 0) && - "Alignment required for atomic store"); } StoreInst::StoreInst(Value *val, Value *addr, Instruction *InsertBefore) @@ -2328,7 +2324,6 @@ bool ShuffleVectorInst::isInsertSubvectorMask(ArrayRef<int> Mask, } Src1Elts.setBit(i); Src1Identity &= (M == (i + NumSrcElts)); - continue; } assert((Src0Elts | Src1Elts | UndefElts).isAllOnes() && "unknown shuffle elements"); @@ -4165,6 +4160,47 @@ bool ICmpInst::compare(const APInt &LHS, const APInt &RHS, }; } +bool FCmpInst::compare(const APFloat &LHS, const APFloat &RHS, + FCmpInst::Predicate Pred) { + APFloat::cmpResult R = LHS.compare(RHS); + switch (Pred) { + default: + llvm_unreachable("Invalid FCmp Predicate"); + case FCmpInst::FCMP_FALSE: + return false; + case FCmpInst::FCMP_TRUE: + return true; + case FCmpInst::FCMP_UNO: + return R == APFloat::cmpUnordered; + case FCmpInst::FCMP_ORD: + return R != APFloat::cmpUnordered; + case FCmpInst::FCMP_UEQ: + return R == APFloat::cmpUnordered || R == APFloat::cmpEqual; + case FCmpInst::FCMP_OEQ: + return R == APFloat::cmpEqual; + case FCmpInst::FCMP_UNE: + return R != APFloat::cmpEqual; + case FCmpInst::FCMP_ONE: + return R == APFloat::cmpLessThan || R == APFloat::cmpGreaterThan; + case FCmpInst::FCMP_ULT: + return R == APFloat::cmpUnordered || R == APFloat::cmpLessThan; + case FCmpInst::FCMP_OLT: + return R == APFloat::cmpLessThan; + case FCmpInst::FCMP_UGT: + return R == APFloat::cmpUnordered || R == APFloat::cmpGreaterThan; + case FCmpInst::FCMP_OGT: + return R == APFloat::cmpGreaterThan; 
+ case FCmpInst::FCMP_ULE: + return R != APFloat::cmpGreaterThan; + case FCmpInst::FCMP_OLE: + return R == APFloat::cmpLessThan || R == APFloat::cmpEqual; + case FCmpInst::FCMP_UGE: + return R != APFloat::cmpLessThan; + case FCmpInst::FCMP_OGE: + return R == APFloat::cmpGreaterThan || R == APFloat::cmpEqual; + } +} + CmpInst::Predicate CmpInst::getFlippedSignednessPredicate(Predicate pred) { assert(CmpInst::isRelational(pred) && "Call only with non-equality predicates!"); @@ -4411,7 +4447,7 @@ void SwitchInstProfUpdateWrapper::addCase( Weights.getValue()[SI.getNumSuccessors() - 1] = *W; } else if (Weights) { Changed = true; - Weights.getValue().push_back(W ? *W : 0); + Weights.getValue().push_back(W.getValueOr(0)); } if (Weights) assert(SI.getNumSuccessors() == Weights->size() && diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 9206cd37a6d1..8f7318665cfb 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -468,6 +468,7 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { } Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, + Type *ReturnType, ArrayRef<Value *> Params) { assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; @@ -486,22 +487,15 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, break; case Intrinsic::vp_load: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {Params[0]->getType()->getPointerElementType(), Params[0]->getType()}); + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_gather: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {VectorType::get(cast<VectorType>(Params[0]->getType()) - ->getElementType() - ->getPointerElementType(), - cast<VectorType>(Params[0]->getType())), - Params[0]->getType()}); + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_store: VPFunc = Intrinsic::getDeclaration( - M, VPID, - {Params[1]->getType()->getPointerElementType(), 
Params[1]->getType()}); + M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::vp_scatter: VPFunc = Intrinsic::getDeclaration( diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index b2909c425846..24c4a348f4da 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -386,8 +386,9 @@ template <> struct MDNodeKeyImpl<DIEnumerator> { IsUnsigned(N->isUnsigned()) {} bool isKeyOf(const DIEnumerator *RHS) const { - return APInt::isSameValue(Value, RHS->getValue()) && - IsUnsigned == RHS->isUnsigned() && Name == RHS->getRawName(); + return Value.getBitWidth() == RHS->getValue().getBitWidth() && + Value == RHS->getValue() && IsUnsigned == RHS->isUnsigned() && + Name == RHS->getRawName(); } unsigned getHashValue() const { return hash_combine(Value, Name); } @@ -1424,6 +1425,8 @@ public: DenseMap<const GlobalValue *, DSOLocalEquivalent *> DSOLocalEquivalents; + DenseMap<const GlobalValue *, NoCFIValue *> NoCFIValues; + ConstantUniqueMap<ConstantExpr> ExprConstants; ConstantUniqueMap<InlineAsm> InlineAsms; diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 7bccf09012ca..bb72bec93066 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -886,9 +886,8 @@ void PMDataManager::recordAvailableAnalysis(Pass *P) { // implements as well. const PassInfo *PInf = TPM->findAnalysisPassInfo(PI); if (!PInf) return; - const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); - for (unsigned i = 0, e = II.size(); i != e; ++i) - AvailableAnalysis[II[i]->getTypeInfo()] = P; + for (const PassInfo *PI : PInf->getInterfacesImplemented()) + AvailableAnalysis[PI->getTypeInfo()] = P; } // Return true if P preserves high level analysis used by other @@ -1013,10 +1012,9 @@ void PMDataManager::freePass(Pass *P, StringRef Msg, // Remove all interfaces this pass implements, for which it is also // listed as the available implementation. 
- const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented(); - for (unsigned i = 0, e = II.size(); i != e; ++i) { - DenseMap<AnalysisID, Pass*>::iterator Pos = - AvailableAnalysis.find(II[i]->getTypeInfo()); + for (const PassInfo *PI : PInf->getInterfacesImplemented()) { + DenseMap<AnalysisID, Pass *>::iterator Pos = + AvailableAnalysis.find(PI->getTypeInfo()); if (Pos != AvailableAnalysis.end() && Pos->second == P) AvailableAnalysis.erase(Pos); } diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 63ea41fba89a..a0485a59d0e0 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -750,8 +750,8 @@ void Module::setSDKVersion(const VersionTuple &V) { ConstantDataArray::get(Context, Entries)); } -VersionTuple Module::getSDKVersion() const { - auto *CM = dyn_cast_or_null<ConstantAsMetadata>(getModuleFlag("SDK Version")); +static VersionTuple getSDKVersionMD(Metadata *MD) { + auto *CM = dyn_cast_or_null<ConstantAsMetadata>(MD); if (!CM) return {}; auto *Arr = dyn_cast_or_null<ConstantDataArray>(CM->getValue()); @@ -775,6 +775,10 @@ VersionTuple Module::getSDKVersion() const { return Result; } +VersionTuple Module::getSDKVersion() const { + return getSDKVersionMD(getModuleFlag("SDK Version")); +} + GlobalVariable *llvm::collectUsedGlobalVariables( const Module &M, SmallVectorImpl<GlobalValue *> &Vec, bool CompilerUsed) { const char *Name = CompilerUsed ? 
"llvm.compiler.used" : "llvm.used"; @@ -809,3 +813,13 @@ void Module::setPartialSampleProfileRatio(const ModuleSummaryIndex &Index) { } } } + +StringRef Module::getDarwinTargetVariantTriple() const { + if (const auto *MD = getModuleFlag("darwin.target_variant.triple")) + return cast<MDString>(MD)->getString(); + return ""; +} + +VersionTuple Module::getDarwinTargetVariantSDKVersion() const { + return getSDKVersionMD(getModuleFlag("darwin.target_variant.SDK Version")); +} diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp index 31c5cd938d03..a0ac7d3ad7d3 100644 --- a/llvm/lib/IR/ModuleSummaryIndex.cpp +++ b/llvm/lib/IR/ModuleSummaryIndex.cpp @@ -447,11 +447,17 @@ static std::string linkageToString(GlobalValue::LinkageTypes LT) { static std::string fflagsToString(FunctionSummary::FFlags F) { auto FlagValue = [](unsigned V) { return V ? '1' : '0'; }; - char FlagRep[] = {FlagValue(F.ReadNone), FlagValue(F.ReadOnly), - FlagValue(F.NoRecurse), FlagValue(F.ReturnDoesNotAlias), - FlagValue(F.NoInline), FlagValue(F.AlwaysInline), - FlagValue(F.NoUnwind), FlagValue(F.MayThrow), - FlagValue(F.HasUnknownCall), 0}; + char FlagRep[] = {FlagValue(F.ReadNone), + FlagValue(F.ReadOnly), + FlagValue(F.NoRecurse), + FlagValue(F.ReturnDoesNotAlias), + FlagValue(F.NoInline), + FlagValue(F.AlwaysInline), + FlagValue(F.NoUnwind), + FlagValue(F.MayThrow), + FlagValue(F.HasUnknownCall), + FlagValue(F.MustBeUnreachable), + 0}; return FlagRep; } diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp index d15fcfbc5b9f..08c1fc931e2e 100644 --- a/llvm/lib/IR/Operator.cpp +++ b/llvm/lib/IR/Operator.cpp @@ -39,9 +39,10 @@ bool Operator::hasPoisonGeneratingFlags() const { return GEP->isInBounds() || GEP->getInRangeIndex() != None; } default: + if (const auto *FP = dyn_cast<FPMathOperator>(this)) + return FP->hasNoNaNs() || FP->hasNoInfs(); return false; } - // TODO: FastMathFlags! 
(On instructions, but not constexpr) } Type *GEPOperator::getSourceElementType() const { @@ -89,7 +90,7 @@ bool GEPOperator::accumulateConstantOffset( assert(Offset.getBitWidth() == DL.getIndexSizeInBits(getPointerAddressSpace()) && "The offset bit width does not match DL specification."); - SmallVector<const Value *> Index(value_op_begin() + 1, value_op_end()); + SmallVector<const Value *> Index(llvm::drop_begin(operand_values())); return GEPOperator::accumulateConstantOffset(getSourceElementType(), Index, DL, Offset, ExternalAnalysis); } diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp new file mode 100644 index 000000000000..a96e39f32882 --- /dev/null +++ b/llvm/lib/IR/SSAContext.cpp @@ -0,0 +1,47 @@ +//===- SSAContext.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file defines a specialization of the GenericSSAContext<X> +/// template class for LLVM IR. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/SSAContext.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +BasicBlock *SSAContext::getEntryBlock(Function &F) { + return &F.getEntryBlock(); +} + +void SSAContext::setFunction(Function &Fn) { F = &Fn; } + +Printable SSAContext::print(Value *V) const { + return Printable([V](raw_ostream &Out) { V->print(Out); }); +} + +Printable SSAContext::print(Instruction *Inst) const { + return print(cast<Value>(Inst)); +} + +Printable SSAContext::print(BasicBlock *BB) const { + if (BB->hasName()) + return Printable([BB](raw_ostream &Out) { Out << BB->getName(); }); + + return Printable([BB](raw_ostream &Out) { + ModuleSlotTracker MST{BB->getParent()->getParent(), false}; + MST.incorporateFunction(*BB->getParent()); + Out << MST.getLocalSlot(BB); + }); +} diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b475c8327874..8741ed917f9f 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -928,7 +928,7 @@ Align Value::getPointerAlignment(const DataLayout &DL) const { } llvm_unreachable("Unhandled FunctionPtrAlignType"); } - const MaybeAlign Alignment(GO->getAlignment()); + const MaybeAlign Alignment(GO->getAlign()); if (!Alignment) { if (auto *GVar = dyn_cast<GlobalVariable>(GO)) { Type *ObjectType = GVar->getValueType(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 154b59835b01..fb7c423e54e2 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -543,7 +543,7 @@ private: void verifySwiftErrorCall(CallBase &Call, const Value *SwiftErrorVal); void verifySwiftErrorValue(const Value *SwiftErrorVal); - void verifyTailCCMustTailAttrs(AttrBuilder Attrs, StringRef Context); + void verifyTailCCMustTailAttrs(const AttrBuilder &Attrs, StringRef Context); void 
verifyMustTailCall(CallInst &CI); bool verifyAttributeCount(AttributeList Attrs, unsigned Params); void verifyAttributeTypes(AttributeSet Attrs, const Value *V); @@ -553,8 +553,6 @@ private: void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, const Value *V, bool IsIntrinsic); void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs); - template <typename T> - void verifyODRTypeAsScopeOperand(const MDNode &MD, T * = nullptr); void visitConstantExprsRecursively(const Constant *EntryC); void visitConstantExpr(const ConstantExpr *CE); @@ -604,26 +602,35 @@ void Verifier::visit(Instruction &I) { InstVisitor<Verifier>::visit(I); } -// Helper to recursively iterate over indirect users. By -// returning false, the callback can ask to stop recursing -// further. +// Helper to iterate over indirect users. By returning false, the callback can ask to stop traversing further. static void forEachUser(const Value *User, SmallPtrSet<const Value *, 32> &Visited, llvm::function_ref<bool(const Value *)> Callback) { if (!Visited.insert(User).second) return; - for (const Value *TheNextUser : User->materialized_users()) - if (Callback(TheNextUser)) - forEachUser(TheNextUser, Visited, Callback); + + SmallVector<const Value *> WorkList; + append_range(WorkList, User->materialized_users()); + while (!WorkList.empty()) { + const Value *Cur = WorkList.pop_back_val(); + if (!Visited.insert(Cur).second) + continue; + if (Callback(Cur)) + append_range(WorkList, Cur->materialized_users()); + } } void Verifier::visitGlobalValue(const GlobalValue &GV) { Assert(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(), "Global is external, but doesn't have external or weak linkage!", &GV); - if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV)) - Assert(GO->getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", GO); + if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV)) { + + if (MaybeAlign A = GO->getAlign()) { + Assert(A->value() <= 
Value::MaximumAlignment, + "huge alignment values are unsupported", GO); + } + } Assert(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV), "Only global variables can have appending linkage!", &GV); @@ -733,8 +740,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { Value *V = Op->stripPointerCasts(); Assert(isa<GlobalVariable>(V) || isa<Function>(V) || isa<GlobalAlias>(V), - "invalid llvm.used member", V); - Assert(V->hasName(), "members of llvm.used must be named", V); + Twine("invalid ") + GV.getName() + " member", V); + Assert(V->hasName(), + Twine("members of ") + GV.getName() + " must be named", V); } } } @@ -860,19 +868,6 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) { } } -template <typename T> -void Verifier::verifyODRTypeAsScopeOperand(const MDNode &MD, T *) { - if (isa<T>(MD)) { - if (auto *N = dyn_cast_or_null<DICompositeType>(cast<T>(MD).getScope())) - // Of all the supported tags for DICompositeType(see visitDICompositeType) - // we know that enum type cannot be a scope. - AssertDI(N->getTag() != dwarf::DW_TAG_enumeration_type, - "enum type is not a scope; check enum type ODR " - "violation", - N, &MD); - } -} - void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { // Only visit each node once. Metadata can be mutually recursive, so this // avoids infinite recursion here, as well as being an optimization. @@ -882,12 +877,6 @@ void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) { Assert(&MD.getContext() == &Context, "MDNode context does not match Module context!", &MD); - // Makes sure when a scope operand is a ODR type, the ODR type uniquing does - // not create invalid debug metadata. - // TODO: check that the non-ODR-type scope operand is valid. 
- verifyODRTypeAsScopeOperand<DIType>(MD); - verifyODRTypeAsScopeOperand<DILocalScope>(MD); - switch (MD.getMetadataID()) { default: llvm_unreachable("Invalid MDNode subclass"); @@ -2055,10 +2044,12 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs, } if (Attrs.hasFnAttr(Attribute::VScaleRange)) { - std::pair<unsigned, unsigned> Args = - Attrs.getFnAttrs().getVScaleRangeArgs(); + unsigned VScaleMin = Attrs.getFnAttrs().getVScaleRangeMin(); + if (VScaleMin == 0) + CheckFailed("'vscale_range' minimum must be greater than 0", V); - if (Args.first > Args.second && Args.second != 0) + Optional<unsigned> VScaleMax = Attrs.getFnAttrs().getVScaleRangeMax(); + if (VScaleMax && VScaleMin > VScaleMax) CheckFailed("'vscale_range' minimum cannot be greater than maximum", V); } @@ -3328,7 +3319,7 @@ void Verifier::visitCallBase(CallBase &Call) { visitInstruction(Call); } -void Verifier::verifyTailCCMustTailAttrs(AttrBuilder Attrs, +void Verifier::verifyTailCCMustTailAttrs(const AttrBuilder &Attrs, StringRef Context) { Assert(!Attrs.contains(Attribute::InAlloca), Twine("inalloca attribute not allowed in ") + Context); @@ -3733,15 +3724,15 @@ void Verifier::visitLoadInst(LoadInst &LI) { PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType()); Assert(PTy, "Load operand must be a pointer.", &LI); Type *ElTy = LI.getType(); - Assert(LI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &LI); + if (MaybeAlign A = LI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &LI); + } Assert(ElTy->isSized(), "loading unsized types is not allowed", &LI); if (LI.isAtomic()) { Assert(LI.getOrdering() != AtomicOrdering::Release && LI.getOrdering() != AtomicOrdering::AcquireRelease, "Load cannot have Release ordering", &LI); - Assert(LI.getAlignment() != 0, - "Atomic load must specify explicit alignment", &LI); Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), 
"atomic load operand must have integer, pointer, or floating point " "type!", @@ -3761,15 +3752,15 @@ void Verifier::visitStoreInst(StoreInst &SI) { Type *ElTy = SI.getOperand(0)->getType(); Assert(PTy->isOpaqueOrPointeeTypeMatches(ElTy), "Stored value type does not match pointer operand type!", &SI, ElTy); - Assert(SI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &SI); + if (MaybeAlign A = SI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &SI); + } Assert(ElTy->isSized(), "storing unsized types is not allowed", &SI); if (SI.isAtomic()) { Assert(SI.getOrdering() != AtomicOrdering::Acquire && SI.getOrdering() != AtomicOrdering::AcquireRelease, "Store cannot have Acquire ordering", &SI); - Assert(SI.getAlignment() != 0, - "Atomic store must specify explicit alignment", &SI); Assert(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(), "atomic store operand must have integer, pointer, or floating point " "type!", @@ -3820,8 +3811,10 @@ void Verifier::visitAllocaInst(AllocaInst &AI) { "Cannot allocate unsized type", &AI); Assert(AI.getArraySize()->getType()->isIntegerTy(), "Alloca array size must have integer type", &AI); - Assert(AI.getAlignment() <= Value::MaximumAlignment, - "huge alignment values are unsupported", &AI); + if (MaybeAlign A = AI.getAlign()) { + Assert(A->value() <= Value::MaximumAlignment, + "huge alignment values are unsupported", &AI); + } if (AI.isSwiftError()) { verifySwiftErrorValue(&AI); diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 6ce2ed265739..f26ef4b21996 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1106,7 +1106,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) - return Error::success(); + return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); if (!Conf.CodeGenOnly) { for (const auto &R : GlobalResolutions) { 
@@ -1132,7 +1132,7 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { if (Conf.PostInternalizeModuleHook && !Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule)) - return Error::success(); + return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile)); } if (!RegularLTO.EmptyCombinedModule || Conf.AlwaysEmitRegularLTOObj) { diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index be06556b0c3b..855d0fc8a8be 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -37,7 +37,6 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -413,6 +412,8 @@ static void codegen(const Config &Conf, TargetMachine *TM, if (Error Err = StreamOrErr.takeError()) report_fatal_error(std::move(Err)); std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr; + TM->Options.ObjectFilenameForDebug = Stream->ObjectPathName; + legacy::PassManager CodeGenPasses; CodeGenPasses.add( createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex)); diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index 088e45c9e8dc..fdc9896aca78 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -135,9 +135,8 @@ LTOCodeGenerator::LTOCodeGenerator(LLVMContext &Context) LTOCodeGenerator::~LTOCodeGenerator() {} void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) { - const std::vector<StringRef> &undefs = Mod->getAsmUndefinedRefs(); - for (int i = 0, e = undefs.size(); i != e; ++i) - AsmUndefinedRefs.insert(undefs[i]); + for (const StringRef &Undef : Mod->getAsmUndefinedRefs()) + AsmUndefinedRefs.insert(Undef); } bool LTOCodeGenerator::addModule(LTOModule *Mod) { diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 
9474d8c9dafb..9aea27f0fdba 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -378,7 +378,8 @@ std::unique_ptr<MemoryBuffer> codegenModule(Module &TheModule, // Run codegen now. resulting binary is in OutputBuffer. PM.run(TheModule); } - return std::make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer)); + return std::make_unique<SmallVectorMemoryBuffer>( + std::move(OutputBuffer), /*RequiresNullTerminator=*/false); } /// Manage caching for a single Module. @@ -541,7 +542,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI); WriteBitcodeToFile(TheModule, OS, true, &Index); } - return std::make_unique<SmallVectorMemoryBuffer>(std::move(OutputBuffer)); + return std::make_unique<SmallVectorMemoryBuffer>( + std::move(OutputBuffer), /*RequiresNullTerminator=*/false); } return codegenModule(TheModule, TM); diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp index 1aa3476eb357..37c4b79f8e29 100644 --- a/llvm/lib/LineEditor/LineEditor.cpp +++ b/llvm/lib/LineEditor/LineEditor.cpp @@ -69,9 +69,8 @@ LineEditor::ListCompleterConcept::complete(StringRef Buffer, size_t Pos) const { // common prefix will then be empty. 
if (CommonPrefix.empty()) { Action.Kind = CompletionAction::AK_ShowCompletions; - for (std::vector<Completion>::iterator I = Comps.begin(), E = Comps.end(); - I != E; ++I) - Action.Completions.push_back(I->DisplayText); + for (const Completion &Comp : Comps) + Action.Completions.push_back(Comp.DisplayText); } else { Action.Kind = CompletionAction::AK_Insert; Action.Text = CommonPrefix; diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index bad483be197d..b475ea81d107 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -646,7 +646,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { /*init*/ nullptr, SGVar->getName(), /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), SGVar->getAddressSpace()); - NewDGV->setAlignment(MaybeAlign(SGVar->getAlignment())); + NewDGV->setAlignment(SGVar->getAlign()); NewDGV->copyAttributesFrom(SGVar); return NewDGV; } @@ -877,7 +877,7 @@ IRLinker::linkAppendingVarProto(GlobalVariable *DstGV, if (DstGV->isConstant() != SrcGV->isConstant()) return stringErr("Appending variables linked with different const'ness!"); - if (DstGV->getAlignment() != SrcGV->getAlignment()) + if (DstGV->getAlign() != SrcGV->getAlign()) return stringErr( "Appending variables with different alignment need to be linked!"); diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 2ca921017171..5c2aaddff4d1 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -168,9 +168,14 @@ public: unsigned Update, VersionTuple SDKVersion) override; void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, VersionTuple SDKVersion) override; + void emitDarwinTargetVariantBuildVersion(unsigned Platform, unsigned Major, + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) override; void emitThumbFunc(MCSymbol *Func) override; void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override; + void 
emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) override; void emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; @@ -640,6 +645,12 @@ void MCAsmStreamer::emitBuildVersion(unsigned Platform, unsigned Major, EmitEOL(); } +void MCAsmStreamer::emitDarwinTargetVariantBuildVersion( + unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { + emitBuildVersion(Platform, Major, Minor, Update, SDKVersion); +} + void MCAsmStreamer::emitThumbFunc(MCSymbol *Func) { // This needs to emit to a temporary string to get properly quoted // MCSymbols when they have spaces in them. @@ -670,6 +681,15 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { MCStreamer::emitAssignment(Symbol, Value); } +void MCAsmStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) { + OS << ".lto_set_conditional "; + Symbol->print(OS, MAI); + OS << ", "; + Value->print(OS, MAI); + EmitEOL(); +} + void MCAsmStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) { OS << ".weakref "; Alias->print(OS, MAI); diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index d5e9f4fc66bc..a8837bbf57c7 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -89,6 +89,7 @@ MCAssembler::MCAssembler(MCContext &Context, BundleAlignSize(0), RelaxAll(false), SubsectionsViaSymbols(false), IncrementalLinkerCompatible(false), ELFHeaderEFlags(0) { VersionInfo.Major = 0; // Major version == 0 for "none specified" + DarwinTargetVariantVersionInfo.Major = 0; } MCAssembler::~MCAssembler() = default; @@ -109,6 +110,8 @@ void MCAssembler::reset() { LOHContainer.reset(); VersionInfo.Major = 0; VersionInfo.SDKVersion = VersionTuple(); + DarwinTargetVariantVersionInfo.Major = 0; + DarwinTargetVariantVersionInfo.SDKVersion = VersionTuple(); // reset objects owned by us if 
(getBackendPtr()) diff --git a/llvm/lib/MC/MCInstrAnalysis.cpp b/llvm/lib/MC/MCInstrAnalysis.cpp index 52b59185c6fc..4ed1c6286a72 100644 --- a/llvm/lib/MC/MCInstrAnalysis.cpp +++ b/llvm/lib/MC/MCInstrAnalysis.cpp @@ -39,4 +39,4 @@ Optional<uint64_t> MCInstrAnalysis::getMemoryOperandRelocationOffset(const MCInst &Inst, uint64_t Size) const { return None; -}
\ No newline at end of file +} diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index aa94b141d8be..3edf7a3f49e6 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -92,6 +92,9 @@ public: unsigned Update, VersionTuple SDKVersion) override; void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, VersionTuple SDKVersion) override; + void emitDarwinTargetVariantBuildVersion(unsigned Platform, unsigned Major, + unsigned Minor, unsigned Update, + VersionTuple SDKVersion) override; void emitThumbFunc(MCSymbol *Func) override; bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override; void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override; @@ -283,6 +286,13 @@ void MCMachOStreamer::emitBuildVersion(unsigned Platform, unsigned Major, Update, SDKVersion); } +void MCMachOStreamer::emitDarwinTargetVariantBuildVersion( + unsigned Platform, unsigned Major, unsigned Minor, unsigned Update, + VersionTuple SDKVersion) { + getAssembler().setDarwinTargetVariantBuildVersion( + (MachO::PlatformType)Platform, Major, Minor, Update, SDKVersion); +} + void MCMachOStreamer::emitThumbFunc(MCSymbol *Symbol) { // Remember that the function is a thumb function. Fixup and relocation // values will need adjusted. 
@@ -516,7 +526,10 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context, new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE), DWARFMustBeAtTheEnd, LabelSections); const Triple &Target = Context.getTargetTriple(); - S->emitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion()); + S->emitVersionForTarget( + Target, Context.getObjectFileInfo()->getSDKVersion(), + Context.getObjectFileInfo()->getDarwinTargetVariantTriple(), + Context.getObjectFileInfo()->getDarwinTargetVariantSDKVersion()); if (RelaxAll) S->getAssembler().setRelaxAll(true); return S; diff --git a/llvm/lib/MC/MCNullStreamer.cpp b/llvm/lib/MC/MCNullStreamer.cpp index 291d840b4f4b..40b7eba58b03 100644 --- a/llvm/lib/MC/MCNullStreamer.cpp +++ b/llvm/lib/MC/MCNullStreamer.cpp @@ -40,6 +40,9 @@ namespace { void EmitCOFFSymbolStorageClass(int StorageClass) override {} void EmitCOFFSymbolType(int Type) override {} void EndCOFFSymbolDef() override {} + void + emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol, MCSymbolAttr Linkage, + MCSymbolAttr Visibility) override {} }; } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 9c86fcc86bcb..6604d7988c4c 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -281,6 +281,18 @@ void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { Symbol->setOffset(0); addPendingLabel(Symbol); } + + emitPendingAssignments(Symbol); +} + +void MCObjectStreamer::emitPendingAssignments(MCSymbol *Symbol) { + auto Assignments = pendingAssignments.find(Symbol); + if (Assignments != pendingAssignments.end()) { + for (const PendingAssignment &A : Assignments->second) + emitAssignment(A.Symbol, A.Value); + + pendingAssignments.erase(Assignments); + } } // Emit a label at a previously emitted fragment/offset position. 
This must be @@ -353,6 +365,19 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section, void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { getAssembler().registerSymbol(*Symbol); MCStreamer::emitAssignment(Symbol, Value); + emitPendingAssignments(Symbol); +} + +void MCObjectStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) { + const MCSymbol *Target = &cast<MCSymbolRefExpr>(*Value).getSymbol(); + + // If the symbol already exists, emit the assignment. Otherwise, emit it + // later only if the symbol is also emitted. + if (Target->isRegistered()) + emitAssignment(Symbol, Value); + else + pendingAssignments[Target].push_back({Symbol, Value}); } bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const { diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index ed9f2066dc20..705f7159d55b 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -356,8 +356,14 @@ private: /// return the contents from the current token up to the end or comma. StringRef parseStringToComma(); - bool parseAssignment(StringRef Name, bool allow_redef, - bool NoDeadStrip = false); + enum class AssignmentKind { + Set, + Equiv, + Equal, + LTOSetConditional, + }; + + bool parseAssignment(StringRef Name, AssignmentKind Kind); unsigned getBinOpPrecedence(AsmToken::TokenKind K, MCBinaryExpr::Opcode &Kind); @@ -534,6 +540,7 @@ private: DK_ADDRSIG_SYM, DK_PSEUDO_PROBE, DK_LTO_DISCARD, + DK_LTO_SET_CONDITIONAL, DK_END }; @@ -564,8 +571,8 @@ private: const fltSemantics &); // ".single", ... 
bool parseDirectiveFill(); // ".fill" bool parseDirectiveZero(); // ".zero" - // ".set", ".equ", ".equiv" - bool parseDirectiveSet(StringRef IDVal, bool allow_redef); + // ".set", ".equ", ".equiv", ".lto_set_conditional" + bool parseDirectiveSet(StringRef IDVal, AssignmentKind Kind); bool parseDirectiveOrg(); // ".org" // ".align{,32}", ".p2align{,w,l}" bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize); @@ -1968,7 +1975,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, // identifier '=' ... -> assignment statement Lex(); - return parseAssignment(IDVal, true); + return parseAssignment(IDVal, AssignmentKind::Equal); default: // Normal instruction or directive. break; @@ -2027,9 +2034,11 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, break; case DK_SET: case DK_EQU: - return parseDirectiveSet(IDVal, true); + return parseDirectiveSet(IDVal, AssignmentKind::Set); case DK_EQUIV: - return parseDirectiveSet(IDVal, false); + return parseDirectiveSet(IDVal, AssignmentKind::Equiv); + case DK_LTO_SET_CONDITIONAL: + return parseDirectiveSet(IDVal, AssignmentKind::LTOSetConditional); case DK_ASCII: return parseDirectiveAscii(IDVal, false); case DK_ASCIZ: @@ -2925,11 +2934,13 @@ void AsmParser::handleMacroExit() { ActiveMacros.pop_back(); } -bool AsmParser::parseAssignment(StringRef Name, bool allow_redef, - bool NoDeadStrip) { +bool AsmParser::parseAssignment(StringRef Name, AssignmentKind Kind) { MCSymbol *Sym; const MCExpr *Value; - if (MCParserUtils::parseAssignmentExpression(Name, allow_redef, *this, Sym, + SMLoc ExprLoc = getTok().getLoc(); + bool AllowRedef = + Kind == AssignmentKind::Set || Kind == AssignmentKind::Equal; + if (MCParserUtils::parseAssignmentExpression(Name, AllowRedef, *this, Sym, Value)) return true; @@ -2944,9 +2955,22 @@ bool AsmParser::parseAssignment(StringRef Name, bool allow_redef, return false; // Do the assignment. 
- Out.emitAssignment(Sym, Value); - if (NoDeadStrip) + switch (Kind) { + case AssignmentKind::Equal: + Out.emitAssignment(Sym, Value); + break; + case AssignmentKind::Set: + case AssignmentKind::Equiv: + Out.emitAssignment(Sym, Value); Out.emitSymbolAttribute(Sym, MCSA_NoDeadStrip); + break; + case AssignmentKind::LTOSetConditional: + if (Value->getKind() != MCExpr::SymbolRef) + return Error(ExprLoc, "expected identifier"); + + Out.emitConditionalAssignment(Sym, Value); + break; + } return false; } @@ -2998,10 +3022,11 @@ bool AsmParser::parseIdentifier(StringRef &Res) { /// ::= .equ identifier ',' expression /// ::= .equiv identifier ',' expression /// ::= .set identifier ',' expression -bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) { +/// ::= .lto_set_conditional identifier ',' expression +bool AsmParser::parseDirectiveSet(StringRef IDVal, AssignmentKind Kind) { StringRef Name; if (check(parseIdentifier(Name), "expected identifier") || parseComma() || - parseAssignment(Name, allow_redef, true)) + parseAssignment(Name, Kind)) return true; return false; } @@ -5581,6 +5606,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM; DirectiveKindMap[".pseudoprobe"] = DK_PSEUDO_PROBE; DirectiveKindMap[".lto_discard"] = DK_LTO_DISCARD; + DirectiveKindMap[".lto_set_conditional"] = DK_LTO_SET_CONDITIONAL; } MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { @@ -6012,12 +6038,13 @@ bool AsmParser::parseMSInlineAsm( bool isOutput = (i == 1) && Desc.mayStore(); SMLoc Start = SMLoc::getFromPointer(SymName.data()); + int64_t Size = Operand.isMemPlaceholder(Desc) ? 
0 : SymName.size(); if (isOutput) { ++InputIdx; OutputDecls.push_back(OpDecl); OutputDeclsAddressOf.push_back(Operand.needAddressOf()); OutputConstraints.push_back(("=" + Constraint).str()); - AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size()); + AsmStrRewrites.emplace_back(AOK_Output, Start, Size); } else { InputDecls.push_back(OpDecl); InputDeclsAddressOf.push_back(Operand.needAddressOf()); @@ -6025,7 +6052,7 @@ bool AsmParser::parseMSInlineAsm( if (Desc.OpInfo[i - 1].isBranchTarget()) AsmStrRewrites.emplace_back(AOK_CallInput, Start, SymName.size()); else - AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size()); + AsmStrRewrites.emplace_back(AOK_Input, Start, Size); } } @@ -6140,13 +6167,17 @@ bool AsmParser::parseMSInlineAsm( OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label; break; case AOK_Input: - OS << '$' << InputIdx++; + if (AR.Len) + OS << '$' << InputIdx; + ++InputIdx; break; case AOK_CallInput: OS << "${" << InputIdx++ << ":P}"; break; case AOK_Output: - OS << '$' << OutputIdx++; + if (AR.Len) + OS << '$' << OutputIdx; + ++OutputIdx; break; case AOK_SizeDirective: switch (AR.Val) { diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index e35bcec8fe75..ebf38327f4dc 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -151,8 +151,8 @@ void MCPseudoProbeInlineTree::emit(MCObjectStreamer *MCOS, // InlineSite is unique for each pair, // so there will be no ordering of Inlinee based on MCPseudoProbeInlineTree* std::map<InlineSite, MCPseudoProbeInlineTree *> Inlinees; - for (auto Child = Children.begin(); Child != Children.end(); ++Child) - Inlinees[Child->first] = Child->second.get(); + for (auto &Child : Children) + Inlinees[Child.first] = Child.second.get(); for (const auto &Inlinee : Inlinees) { if (Guid) { diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index f4e64b42c817..9c37a7bebe2a 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ 
b/llvm/lib/MC/MCStreamer.cpp @@ -431,6 +431,9 @@ void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) { TS->emitLabel(Symbol); } +void MCStreamer::emitConditionalAssignment(MCSymbol *Symbol, + const MCExpr *Value) {} + void MCStreamer::emitCFISections(bool EH, bool Debug) {} void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) { @@ -1308,45 +1311,78 @@ getMachoBuildVersionPlatformType(const Triple &Target) { llvm_unreachable("unexpected OS type"); } -void MCStreamer::emitVersionForTarget(const Triple &Target, - const VersionTuple &SDKVersion) { +void MCStreamer::emitVersionForTarget( + const Triple &Target, const VersionTuple &SDKVersion, + const Triple *DarwinTargetVariantTriple, + const VersionTuple &DarwinTargetVariantSDKVersion) { if (!Target.isOSBinFormatMachO() || !Target.isOSDarwin()) return; // Do we even know the version? if (Target.getOSMajorVersion() == 0) return; - unsigned Major = 0; - unsigned Minor = 0; - unsigned Update = 0; + VersionTuple Version; switch (Target.getOS()) { case Triple::MacOSX: case Triple::Darwin: - Target.getMacOSXVersion(Major, Minor, Update); + Target.getMacOSXVersion(Version); break; case Triple::IOS: case Triple::TvOS: - Target.getiOSVersion(Major, Minor, Update); + Version = Target.getiOSVersion(); break; case Triple::WatchOS: - Target.getWatchOSVersion(Major, Minor, Update); + Version = Target.getWatchOSVersion(); break; default: llvm_unreachable("unexpected OS type"); } - assert(Major != 0 && "A non-zero major version is expected"); - auto LinkedTargetVersion = targetVersionOrMinimumSupportedOSVersion( - Target, VersionTuple(Major, Minor, Update)); + assert(Version.getMajor() != 0 && "A non-zero major version is expected"); + auto LinkedTargetVersion = + targetVersionOrMinimumSupportedOSVersion(Target, Version); auto BuildVersionOSVersion = getMachoBuildVersionSupportedOS(Target); + bool ShouldEmitBuildVersion = false; if (BuildVersionOSVersion.empty() || - LinkedTargetVersion >= BuildVersionOSVersion) - return 
emitBuildVersion(getMachoBuildVersionPlatformType(Target), - LinkedTargetVersion.getMajor(), - *LinkedTargetVersion.getMinor(), - *LinkedTargetVersion.getSubminor(), SDKVersion); + LinkedTargetVersion >= BuildVersionOSVersion) { + if (Target.isMacCatalystEnvironment() && DarwinTargetVariantTriple && + DarwinTargetVariantTriple->isMacOSX()) { + emitVersionForTarget(*DarwinTargetVariantTriple, + DarwinTargetVariantSDKVersion, + /*TargetVariantTriple=*/nullptr, + /*TargetVariantSDKVersion=*/VersionTuple()); + emitDarwinTargetVariantBuildVersion( + getMachoBuildVersionPlatformType(Target), + LinkedTargetVersion.getMajor(), + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); + return; + } + emitBuildVersion(getMachoBuildVersionPlatformType(Target), + LinkedTargetVersion.getMajor(), + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), + SDKVersion); + ShouldEmitBuildVersion = true; + } + + if (const Triple *TVT = DarwinTargetVariantTriple) { + if (Target.isMacOSX() && TVT->isMacCatalystEnvironment()) { + auto TVLinkedTargetVersion = + targetVersionOrMinimumSupportedOSVersion(*TVT, TVT->getiOSVersion()); + emitDarwinTargetVariantBuildVersion( + getMachoBuildVersionPlatformType(*TVT), + TVLinkedTargetVersion.getMajor(), + TVLinkedTargetVersion.getMinor().getValueOr(0), + TVLinkedTargetVersion.getSubminor().getValueOr(0), + DarwinTargetVariantSDKVersion); + } + } + + if (ShouldEmitBuildVersion) + return; emitVersionMin(getMachoVersionMinLoadCommandType(Target), LinkedTargetVersion.getMajor(), - *LinkedTargetVersion.getMinor(), - *LinkedTargetVersion.getSubminor(), SDKVersion); + LinkedTargetVersion.getMinor().getValueOr(0), + LinkedTargetVersion.getSubminor().getValueOr(0), SDKVersion); } diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp index 7773d8828931..2a93c352c68a 100644 --- a/llvm/lib/MC/MCWin64EH.cpp +++ b/llvm/lib/MC/MCWin64EH.cpp @@ -351,7 
+351,7 @@ static uint32_t ARM64CountOfUnwindCodes(ArrayRef<WinEH::Instruction> Insns) { // Unwind opcode encodings and restrictions are documented at // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin, - WinEH::Instruction &inst) { + const WinEH::Instruction &inst) { uint8_t b, reg; switch (static_cast<Win64EH::UnwindOpcodes>(inst.Operation)) { default: @@ -1050,10 +1050,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info, // Emit epilog unwind instructions for (auto &I : info->EpilogMap) { auto &EpilogInstrs = I.second; - for (uint32_t i = 0; i < EpilogInstrs.size(); i++) { - WinEH::Instruction inst = EpilogInstrs[i]; + for (const WinEH::Instruction &inst : EpilogInstrs) ARM64EmitUnwindCode(streamer, info->Begin, inst); - } } int32_t BytesMod = CodeWords * 4 - TotalCodeBytes; diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 277d88cf1cd2..16941b1cb727 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -484,15 +484,15 @@ void MachObjectWriter::bindIndirectSymbols(MCAssembler &Asm) { // Report errors for use of .indirect_symbol not in a symbol pointer section // or stub section. 
- for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(), - ie = Asm.indirect_symbol_end(); it != ie; ++it) { - const MCSectionMachO &Section = cast<MCSectionMachO>(*it->Section); + for (IndirectSymbolData &ISD : llvm::make_range(Asm.indirect_symbol_begin(), + Asm.indirect_symbol_end())) { + const MCSectionMachO &Section = cast<MCSectionMachO>(*ISD.Section); if (Section.getType() != MachO::S_NON_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_LAZY_SYMBOL_POINTERS && Section.getType() != MachO::S_THREAD_LOCAL_VARIABLE_POINTERS && Section.getType() != MachO::S_SYMBOL_STUBS) { - MCSymbol &Symbol = *it->Symbol; + MCSymbol &Symbol = *ISD.Symbol; report_fatal_error("indirect symbol '" + Symbol.getName() + "' not in a symbol pointer or stub section"); } @@ -779,6 +779,17 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, LoadCommandsSize += sizeof(MachO::version_min_command); } + const MCAssembler::VersionInfoType &TargetVariantVersionInfo = + Layout.getAssembler().getDarwinTargetVariantVersionInfo(); + + // Add the target variant version info load command size, if used. + if (TargetVariantVersionInfo.Major != 0) { + ++NumLoadCommands; + assert(TargetVariantVersionInfo.EmitBuildVersion && + "target variant should use build version"); + LoadCommandsSize += sizeof(MachO::build_version_command); + } + // Add the data-in-code load command size, if used. unsigned NumDataRegions = Asm.getDataRegions().size(); if (NumDataRegions) { @@ -862,38 +873,43 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm, } // Write out the deployment target information, if it's available. - if (VersionInfo.Major != 0) { - auto EncodeVersion = [](VersionTuple V) -> uint32_t { - assert(!V.empty() && "empty version"); - unsigned Update = V.getSubminor() ? *V.getSubminor() : 0; - unsigned Minor = V.getMinor() ? 
*V.getMinor() : 0; - assert(Update < 256 && "unencodable update target version"); - assert(Minor < 256 && "unencodable minor target version"); - assert(V.getMajor() < 65536 && "unencodable major target version"); - return Update | (Minor << 8) | (V.getMajor() << 16); - }; - uint32_t EncodedVersion = EncodeVersion( - VersionTuple(VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update)); - uint32_t SDKVersion = !VersionInfo.SDKVersion.empty() - ? EncodeVersion(VersionInfo.SDKVersion) - : 0; - if (VersionInfo.EmitBuildVersion) { - // FIXME: Currently empty tools. Add clang version in the future. - W.write<uint32_t>(MachO::LC_BUILD_VERSION); - W.write<uint32_t>(sizeof(MachO::build_version_command)); - W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform); - W.write<uint32_t>(EncodedVersion); - W.write<uint32_t>(SDKVersion); - W.write<uint32_t>(0); // Empty tools list. - } else { - MachO::LoadCommandType LCType - = getLCFromMCVM(VersionInfo.TypeOrPlatform.Type); - W.write<uint32_t>(LCType); - W.write<uint32_t>(sizeof(MachO::version_min_command)); - W.write<uint32_t>(EncodedVersion); - W.write<uint32_t>(SDKVersion); - } - } + auto EmitDeploymentTargetVersion = + [&](const MCAssembler::VersionInfoType &VersionInfo) { + auto EncodeVersion = [](VersionTuple V) -> uint32_t { + assert(!V.empty() && "empty version"); + unsigned Update = V.getSubminor().getValueOr(0); + unsigned Minor = V.getMinor().getValueOr(0); + assert(Update < 256 && "unencodable update target version"); + assert(Minor < 256 && "unencodable minor target version"); + assert(V.getMajor() < 65536 && "unencodable major target version"); + return Update | (Minor << 8) | (V.getMajor() << 16); + }; + uint32_t EncodedVersion = EncodeVersion(VersionTuple( + VersionInfo.Major, VersionInfo.Minor, VersionInfo.Update)); + uint32_t SDKVersion = !VersionInfo.SDKVersion.empty() + ? EncodeVersion(VersionInfo.SDKVersion) + : 0; + if (VersionInfo.EmitBuildVersion) { + // FIXME: Currently empty tools. 
Add clang version in the future. + W.write<uint32_t>(MachO::LC_BUILD_VERSION); + W.write<uint32_t>(sizeof(MachO::build_version_command)); + W.write<uint32_t>(VersionInfo.TypeOrPlatform.Platform); + W.write<uint32_t>(EncodedVersion); + W.write<uint32_t>(SDKVersion); + W.write<uint32_t>(0); // Empty tools list. + } else { + MachO::LoadCommandType LCType = + getLCFromMCVM(VersionInfo.TypeOrPlatform.Type); + W.write<uint32_t>(LCType); + W.write<uint32_t>(sizeof(MachO::version_min_command)); + W.write<uint32_t>(EncodedVersion); + W.write<uint32_t>(SDKVersion); + } + }; + if (VersionInfo.Major != 0) + EmitDeploymentTargetVersion(VersionInfo); + if (TargetVariantVersionInfo.Major != 0) + EmitDeploymentTargetVersion(TargetVariantVersionInfo); // Write the data-in-code load command, if used. uint64_t DataInCodeTableEnd = RelocTableEnd + NumDataRegions * 8; diff --git a/llvm/lib/MC/TargetRegistry.cpp b/llvm/lib/MC/TargetRegistry.cpp index 0948a6b9f1a1..09684b1e5ad2 100644 --- a/llvm/lib/MC/TargetRegistry.cpp +++ b/llvm/lib/MC/TargetRegistry.cpp @@ -124,10 +124,10 @@ void TargetRegistry::printRegisteredTargetsForVersion(raw_ostream &OS) { array_pod_sort(Targets.begin(), Targets.end(), TargetArraySortFn); OS << " Registered Targets:\n"; - for (unsigned i = 0, e = Targets.size(); i != e; ++i) { - OS << " " << Targets[i].first; - OS.indent(Width - Targets[i].first.size()) << " - " - << Targets[i].second->getShortDescription() << '\n'; + for (const auto &Target : Targets) { + OS << " " << Target.first; + OS.indent(Width - Target.first.size()) + << " - " << Target.second->getShortDescription() << '\n'; } if (Targets.empty()) OS << " (none)\n"; diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index ce997464caa7..da8bcec7f3d4 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -696,7 +696,7 @@ writeArchiveToBuffer(ArrayRef<NewArchiveMember> NewMembers, bool WriteSymtab, return std::move(E); return 
std::make_unique<SmallVectorMemoryBuffer>( - std::move(ArchiveBufferVector)); + std::move(ArchiveBufferVector), /*RequiresNullTerminator=*/false); } } // namespace llvm diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index 84181ae5e501..6e56da1a31f3 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -210,6 +210,8 @@ uint32_t llvm::object::getELFRelativeRelocationType(uint32_t Machine) { return ELF::R_SPARC_RELATIVE; case ELF::EM_CSKY: return ELF::R_CKCORE_RELATIVE; + case ELF::EM_VE: + return ELF::R_VE_RELATIVE; case ELF::EM_AMDGPU: break; case ELF::EM_BPF: diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index 7501661591f0..42e257516f4e 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -26,12 +26,15 @@ #include "llvm/Object/SymbolicFile.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" #include "llvm/Support/Host.h" #include "llvm/Support/LEB128.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> @@ -4719,3 +4722,46 @@ StringRef MachOObjectFile::mapDebugSectionName(StringRef Name) const { .Case("debug_str_offs", "debug_str_offsets") .Default(Name); } + +Expected<std::vector<std::string>> +MachOObjectFile::findDsymObjectMembers(StringRef Path) { + SmallString<256> BundlePath(Path); + // Normalize input path. This is necessary to accept `bundle.dSYM/`. 
+ sys::path::remove_dots(BundlePath); + if (!sys::fs::is_directory(BundlePath) || + sys::path::extension(BundlePath) != ".dSYM") + return std::vector<std::string>(); + sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); + bool IsDir; + auto EC = sys::fs::is_directory(BundlePath, IsDir); + if (EC == errc::no_such_file_or_directory || (!EC && !IsDir)) + return createStringError( + EC, "%s: expected directory 'Contents/Resources/DWARF' in dSYM bundle", + Path.str().c_str()); + if (EC) + return createFileError(BundlePath, errorCodeToError(EC)); + + std::vector<std::string> ObjectPaths; + for (sys::fs::directory_iterator Dir(BundlePath, EC), DirEnd; + Dir != DirEnd && !EC; Dir.increment(EC)) { + StringRef ObjectPath = Dir->path(); + sys::fs::file_status Status; + if (auto EC = sys::fs::status(ObjectPath, Status)) + return createFileError(ObjectPath, errorCodeToError(EC)); + switch (Status.type()) { + case sys::fs::file_type::regular_file: + case sys::fs::file_type::symlink_file: + case sys::fs::file_type::type_unknown: + ObjectPaths.push_back(ObjectPath.str()); + break; + default: /*ignore*/; + } + } + if (EC) + return createFileError(BundlePath, errorCodeToError(EC)); + if (ObjectPaths.empty()) + return createStringError(std::error_code(), + "%s: no objects found in dSYM bundle", + Path.str().c_str()); + return ObjectPaths; +} diff --git a/llvm/lib/Object/MachOUniversalWriter.cpp b/llvm/lib/Object/MachOUniversalWriter.cpp index 9673c97a10f0..ae1ff09a4f8f 100644 --- a/llvm/lib/Object/MachOUniversalWriter.cpp +++ b/llvm/lib/Object/MachOUniversalWriter.cpp @@ -19,7 +19,6 @@ #include "llvm/Object/IRObjectFile.h" #include "llvm/Object/MachO.h" #include "llvm/Object/MachOUniversal.h" -#include "llvm/Support/SmallVectorMemoryBuffer.h" using namespace llvm; using namespace object; diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp index 66ad16db1ba4..d884e2fd55cd 100644 --- a/llvm/lib/ObjectYAML/COFFEmitter.cpp +++ 
b/llvm/lib/ObjectYAML/COFFEmitter.cpp @@ -64,11 +64,7 @@ struct COFFParser { } bool parseSections() { - for (std::vector<COFFYAML::Section>::iterator i = Obj.Sections.begin(), - e = Obj.Sections.end(); - i != e; ++i) { - COFFYAML::Section &Sec = *i; - + for (COFFYAML::Section &Sec : Obj.Sections) { // If the name is less than 8 bytes, store it in place, otherwise // store it in the string table. StringRef Name = Sec.Name; @@ -103,11 +99,7 @@ struct COFFParser { } bool parseSymbols() { - for (std::vector<COFFYAML::Symbol>::iterator i = Obj.Symbols.begin(), - e = Obj.Symbols.end(); - i != e; ++i) { - COFFYAML::Symbol &Sym = *i; - + for (COFFYAML::Symbol &Sym : Obj.Symbols) { // If the name is less than 8 bytes, store it in place, otherwise // store it in the string table. StringRef Name = Sym.Name; diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index e0dde4433d24..9b9266998ea6 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -464,29 +464,31 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH); break; case ELF::EM_HEXAGON: - BCase(EF_HEXAGON_MACH_V2); - BCase(EF_HEXAGON_MACH_V3); - BCase(EF_HEXAGON_MACH_V4); - BCase(EF_HEXAGON_MACH_V5); - BCase(EF_HEXAGON_MACH_V55); - BCase(EF_HEXAGON_MACH_V60); - BCase(EF_HEXAGON_MACH_V62); - BCase(EF_HEXAGON_MACH_V65); - BCase(EF_HEXAGON_MACH_V66); - BCase(EF_HEXAGON_MACH_V67); - BCase(EF_HEXAGON_MACH_V67T); - BCase(EF_HEXAGON_MACH_V68); - BCase(EF_HEXAGON_ISA_V2); - BCase(EF_HEXAGON_ISA_V3); - BCase(EF_HEXAGON_ISA_V4); - BCase(EF_HEXAGON_ISA_V5); - BCase(EF_HEXAGON_ISA_V55); - BCase(EF_HEXAGON_ISA_V60); - BCase(EF_HEXAGON_ISA_V62); - BCase(EF_HEXAGON_ISA_V65); - BCase(EF_HEXAGON_ISA_V66); - BCase(EF_HEXAGON_ISA_V67); - BCase(EF_HEXAGON_ISA_V68); + BCaseMask(EF_HEXAGON_MACH_V2, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V3, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V4, EF_HEXAGON_MACH); + 
BCaseMask(EF_HEXAGON_MACH_V5, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V55, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V60, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V62, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V65, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V66, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V67, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V67T, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V68, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_MACH_V69, EF_HEXAGON_MACH); + BCaseMask(EF_HEXAGON_ISA_V2, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V3, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V4, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V5, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V55, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V60, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V62, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V65, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V66, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V67, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V68, EF_HEXAGON_ISA); + BCaseMask(EF_HEXAGON_ISA_V69, EF_HEXAGON_ISA); break; case ELF::EM_AVR: BCaseMask(EF_AVR_ARCH_AVR1, EF_AVR_ARCH_MASK); diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp index 85d1f82bfafc..cf0d058c518c 100644 --- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp +++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp @@ -86,13 +86,13 @@ bool XCOFFWriter::nameShouldBeInStringTable(StringRef SymbolName) { } bool XCOFFWriter::initRelocations(uint64_t &CurrentOffset) { - for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) { - if (!InitSections[I].Relocations.empty()) { - InitSections[I].NumberOfRelocations = InitSections[I].Relocations.size(); - InitSections[I].FileOffsetToRelocations = CurrentOffset; + for (XCOFFYAML::Section &InitSection : InitSections) { + if (!InitSection.Relocations.empty()) { + InitSection.NumberOfRelocations = InitSection.Relocations.size(); + InitSection.FileOffsetToRelocations = CurrentOffset; 
uint64_t RelSize = Is64Bit ? XCOFF::RelocationSerializationSize64 : XCOFF::RelocationSerializationSize32; - CurrentOffset += InitSections[I].NumberOfRelocations * RelSize; + CurrentOffset += InitSection.NumberOfRelocations * RelSize; if (CurrentOffset > MaxRawDataSize) { ErrHandler("maximum object size of" + Twine(MaxRawDataSize) + "exceeded when writing relocation data"); diff --git a/llvm/lib/ObjectYAML/YAML.cpp b/llvm/lib/ObjectYAML/YAML.cpp index 5dcb113d3395..54e8c627d5a1 100644 --- a/llvm/lib/ObjectYAML/YAML.cpp +++ b/llvm/lib/ObjectYAML/YAML.cpp @@ -30,9 +30,8 @@ StringRef yaml::ScalarTraits<yaml::BinaryRef>::input(StringRef Scalar, void *, return "BinaryRef hex string must contain an even number of nybbles."; // TODO: Can we improve YAMLIO to permit a more accurate diagnostic here? // (e.g. a caret pointing to the offending character). - for (unsigned I = 0, N = Scalar.size(); I != N; ++I) - if (!llvm::isHexDigit(Scalar[I])) - return "BinaryRef hex string must contain only hex digits."; + if (!llvm::all_of(Scalar, llvm::isHexDigit)) + return "BinaryRef hex string must contain only hex digits."; Val = yaml::BinaryRef(Scalar); return {}; } diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp index 19e05b9272bb..c93b7ad7f5fa 100644 --- a/llvm/lib/Option/OptTable.cpp +++ b/llvm/lib/Option/OptTable.cpp @@ -591,16 +591,16 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, // Find the maximum option length. unsigned OptionFieldWidth = 0; - for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { + for (const OptionInfo &Opt : OptionHelp) { // Limit the amount of padding we are willing to give up for alignment. 
- unsigned Length = OptionHelp[i].Name.size(); + unsigned Length = Opt.Name.size(); if (Length <= 23) OptionFieldWidth = std::max(OptionFieldWidth, Length); } const unsigned InitialPad = 2; - for (unsigned i = 0, e = OptionHelp.size(); i != e; ++i) { - const std::string &Option = OptionHelp[i].Name; + for (const OptionInfo &Opt : OptionHelp) { + const std::string &Option = Opt.Name; int Pad = OptionFieldWidth - int(Option.size()); OS.indent(InitialPad) << Option; @@ -609,7 +609,7 @@ static void PrintHelpOptionList(raw_ostream &OS, StringRef Title, OS << "\n"; Pad = OptionFieldWidth + InitialPad; } - OS.indent(Pad + 1) << OptionHelp[i].HelpText << '\n'; + OS.indent(Pad + 1) << Opt.HelpText << '\n'; } } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 561a881bab0c..d7615ef4e9bf 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CostModel.h" +#include "llvm/Analysis/CycleAnalysis.h" #include "llvm/Analysis/DDG.h" #include "llvm/Analysis/DDGPrinter.h" #include "llvm/Analysis/Delinearization.h" @@ -151,6 +152,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/Transforms/Scalar/DivRemPairs.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Scalar/Float2Int.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GuardWidening.h" diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index de1b0ace7876..a6a36ff25402 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -178,6 +178,10 @@ static cl::opt<bool> EnableNoRerunSimplificationPipeline( "than once in the case that SCC mutations cause a function to be " "visited multiple times as long as the function has not been changed")); 
+static cl::opt<bool> EnableMergeFunctions( + "enable-merge-functions", cl::init(false), cl::Hidden, + cl::desc("Enable function merging as part of the optimization pipeline")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -187,7 +191,7 @@ PipelineTuningOptions::PipelineTuningOptions() { LicmMssaOptCap = SetLicmMssaOptCap; LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; CallGraphProfile = true; - MergeFunctions = false; + MergeFunctions = EnableMergeFunctions; EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; } @@ -418,9 +422,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(CorrelatedValuePropagationPass()); FPM.addPass(SimplifyCFGPass()); + FPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); - FPM.addPass(InstCombinePass()); if (!Level.isOptimizingForSize()) FPM.addPass(LibCallsShrinkWrapPass()); @@ -754,9 +758,11 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level, return MIWP; } -ModuleInlinerPass +ModulePassManager PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, ThinOrFullLTOPhase Phase) { + ModulePassManager MPM; + InlineParams IP = getInlineParamsFromOptLevel(Level); if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) @@ -773,7 +779,16 @@ PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, // inline deferral logic in module inliner. 
IP.EnableDeferral = false; - return ModuleInlinerPass(IP, UseInlineAdvisor); + MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); + + MPM.addPass(createModuleToFunctionPassAdaptor( + buildFunctionSimplificationPipeline(Level, Phase), + PTO.EagerlyInvalidateAnalyses)); + + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( + CoroSplitPass(Level != OptimizationLevel::O0))); + + return MPM; } ModulePassManager @@ -980,26 +995,28 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, FPM.addPass(InstCombinePass()); if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { + ExtraVectorPassManager ExtraPasses; // At higher optimization levels, try to clean up any runtime overlap and // alignment checks inserted by the vectorizer. We want to track correlated // runtime checks for two inner loops in the same outer loop, fold any // common computations, hoist loop-invariant aspects out of any outer loop, // and unswitch the runtime checks if possible. Once hoisted, we may have // dead (or speculatable) control flows or more combining opportunities. 
- FPM.addPass(EarlyCSEPass()); - FPM.addPass(CorrelatedValuePropagationPass()); - FPM.addPass(InstCombinePass()); + ExtraPasses.addPass(EarlyCSEPass()); + ExtraPasses.addPass(CorrelatedValuePropagationPass()); + ExtraPasses.addPass(InstCombinePass()); LoopPassManager LPM; LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); - FPM.addPass( + ExtraPasses.addPass( RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); - FPM.addPass( + ExtraPasses.addPass( createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); - FPM.addPass(SimplifyCFGPass()); - FPM.addPass(InstCombinePass()); + ExtraPasses.addPass(SimplifyCFGPass()); + ExtraPasses.addPass(InstCombinePass()); + FPM.addPass(std::move(ExtraPasses)); } // Now that we've formed fast to execute loop structures, we do further @@ -1149,8 +1166,9 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, // Disable header duplication at -Oz. LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); // Some loops may have become dead by now. Try to delete them. - // FIXME: see disscussion in https://reviews.llvm.org/D112851 - // this may need to be revisited once GVN is more powerful. + // FIXME: see discussion in https://reviews.llvm.org/D112851, + // this may need to be revisited once we run GVN before loop deletion + // in the simplification pipeline. LPM.addPass(LoopDeletionPass()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); @@ -1167,23 +1185,6 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); - // Split out cold code. Splitting is done late to avoid hiding context from - // other optimizations and inadvertently regressing performance. 
The tradeoff - // is that this has a higher code size cost than splitting early. - if (EnableHotColdSplit && !LTOPreLink) - MPM.addPass(HotColdSplittingPass()); - - // Search the code for similar regions of code. If enough similar regions can - // be found where extracting the regions into their own function will decrease - // the size of the program, we extract the regions, a deduplicate the - // structurally similar regions. - if (EnableIROutliner) - MPM.addPass(IROutlinerPass()); - - // Merge functions if requested. - if (PTO.MergeFunctions) - MPM.addPass(MergeFunctionsPass()); - // LoopSink pass sinks instructions hoisted by LICM, which serves as a // canonicalization pass that enables other optimizations. As a result, // LoopSink pass needs to be a very late IR pass to avoid undoing LICM @@ -1211,6 +1212,23 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, for (auto &C : OptimizerLastEPCallbacks) C(MPM, Level); + // Split out cold code. Splitting is done late to avoid hiding context from + // other optimizations and inadvertently regressing performance. The tradeoff + // is that this has a higher code size cost than splitting early. + if (EnableHotColdSplit && !LTOPreLink) + MPM.addPass(HotColdSplittingPass()); + + // Search the code for similar regions of code. If enough similar regions can + // be found where extracting the regions into their own function will decrease + // the size of the program, we extract the regions, a deduplicate the + // structurally similar regions. + if (EnableIROutliner) + MPM.addPass(IROutlinerPass()); + + // Merge functions if requested. + if (PTO.MergeFunctions) + MPM.addPass(MergeFunctionsPass()); + if (PTO.CallGraphProfile) MPM.addPass(CGProfilePass()); @@ -1521,9 +1539,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // function pointers. When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. 
FunctionPassManager PeepholeFPM; + PeepholeFPM.addPass(InstCombinePass()); if (Level == OptimizationLevel::O3) PeepholeFPM.addPass(AggressiveInstCombinePass()); - PeepholeFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(PeepholeFPM, Level); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index c2032b5b8276..74613a7fcce0 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -185,6 +185,7 @@ FUNCTION_ANALYSIS("aa", AAManager()) FUNCTION_ANALYSIS("assumptions", AssumptionAnalysis()) FUNCTION_ANALYSIS("block-freq", BlockFrequencyAnalysis()) FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) +FUNCTION_ANALYSIS("cycles", CycleAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) @@ -202,6 +203,7 @@ FUNCTION_ANALYSIS("no-op-function", NoOpFunctionAnalysis()) FUNCTION_ANALYSIS("opt-remark-emit", OptimizationRemarkEmitterAnalysis()) FUNCTION_ANALYSIS("scalar-evolution", ScalarEvolutionAnalysis()) FUNCTION_ANALYSIS("should-not-run-function-passes", ShouldNotRunFunctionPassesAnalysis()) +FUNCTION_ANALYSIS("should-run-extra-vector-passes", ShouldRunExtraVectorPasses()) FUNCTION_ANALYSIS("stack-safety-local", StackSafetyAnalysis()) FUNCTION_ANALYSIS("targetlibinfo", TargetLibraryAnalysis()) FUNCTION_ANALYSIS("targetir", @@ -253,6 +255,7 @@ FUNCTION_PASS("dse", DSEPass()) FUNCTION_PASS("dot-cfg", CFGPrinterPass()) FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass()) FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) +FUNCTION_PASS("flattencfg", FlattenCFGPass()) FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) FUNCTION_PASS("gvn-sink", GVNSinkPass()) @@ -303,6 +306,7 @@ FUNCTION_PASS("print<assumptions>", 
AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs())) FUNCTION_PASS("print<branch-prob>", BranchProbabilityPrinterPass(dbgs())) FUNCTION_PASS("print<cost-model>", CostModelPrinterPass(dbgs())) +FUNCTION_PASS("print<cycles>", CycleInfoPrinterPass(dbgs())) FUNCTION_PASS("print<da>", DependenceAnalysisPrinterPass(dbgs())) FUNCTION_PASS("print<divergence>", DivergenceAnalysisPrinterPass(dbgs())) FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs())) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 27a6c519ff82..23c825c78713 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1262,11 +1262,6 @@ void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) { namespace { -enum IRChangeDiffType { InBefore, InAfter, IsCommon, NumIRChangeDiffTypes }; - -// Describe where a given element exists. -std::string Colours[NumIRChangeDiffTypes]; - class DisplayNode; class DotCfgDiffDisplayGraph; @@ -1274,19 +1269,19 @@ class DotCfgDiffDisplayGraph; class DisplayElement { public: // Is this in before, after, or both? - IRChangeDiffType getType() const { return Type; } + StringRef getColour() const { return Colour; } protected: - DisplayElement(IRChangeDiffType T) : Type(T) {} - const IRChangeDiffType Type; + DisplayElement(StringRef Colour) : Colour(Colour) {} + const StringRef Colour; }; // An edge representing a transition between basic blocks in the // dot-cfg-changes graph. class DisplayEdge : public DisplayElement { public: - DisplayEdge(std::string V, DisplayNode &Node, IRChangeDiffType T) - : DisplayElement(T), Value(V), Node(Node) {} + DisplayEdge(std::string Value, DisplayNode &Node, StringRef Colour) + : DisplayElement(Colour), Value(Value), Node(Node) {} // The value on which the transition is made. 
std::string getValue() const { return Value; } // The node (representing a basic block) reached by this transition. @@ -1302,8 +1297,8 @@ class DisplayNode : public DisplayElement { public: // \p C is the content for the node, \p T indicates the colour for the // outline of the node - DisplayNode(std::string C, IRChangeDiffType T) - : DisplayElement(T), Content(C) {} + DisplayNode(std::string Content, StringRef Colour) + : DisplayElement(Colour), Content(Content) {} // Iterator to the child nodes. Required by GraphWriter. using ChildIterator = std::unordered_set<DisplayNode *>::const_iterator; @@ -1315,13 +1310,13 @@ public: EdgeIterator edges_begin() const { return EdgePtrs.cbegin(); } EdgeIterator edges_end() const { return EdgePtrs.cend(); } - // Create an edge to \p Node on value \p V, with type \p T. - void createEdge(StringRef V, DisplayNode &Node, IRChangeDiffType T); + // Create an edge to \p Node on value \p Value, with colour \p Colour. + void createEdge(StringRef Value, DisplayNode &Node, StringRef Colour); // Return the content of this node. std::string getContent() const { return Content; } - // Return the type of the edge to node \p S. + // Return the edge to node \p S. const DisplayEdge &getEdge(const DisplayNode &To) const { assert(EdgeMap.find(&To) != EdgeMap.end() && "Expected to find edge."); return *EdgeMap.find(&To)->second; @@ -1383,9 +1378,9 @@ public: } // Create a node. - void createNode(std::string C, IRChangeDiffType T) { + void createNode(std::string C, StringRef Colour) { assert(!NodeGenerationComplete && "Unexpected node creation"); - Nodes.emplace_back(C, T); + Nodes.emplace_back(C, Colour); } // Return the node at index \p N to avoid problems with vectors reallocating. DisplayNode &getNode(unsigned N) { @@ -1408,13 +1403,13 @@ public: // Return a string with colour information for Dot. Required by GraphWriter. 
std::string getNodeAttributes(const DisplayNode &Node) const { - return attribute(Node.getType()); + return attribute(Node.getColour()); } // Return a string with colour information for Dot. Required by GraphWriter. std::string getEdgeColorAttr(const DisplayNode &From, const DisplayNode &To) const { - return attribute(From.getEdge(To).getType()); + return attribute(From.getEdge(To).getColour()); } // Get the starting basic block. Required by GraphWriter. @@ -1425,7 +1420,9 @@ public: protected: // Return the string containing the colour to use as a Dot attribute. - std::string attribute(IRChangeDiffType T) const; + std::string attribute(StringRef Colour) const { + return "color=" + Colour.str(); + } bool NodeGenerationComplete = false; const std::string GraphName; @@ -1434,10 +1431,10 @@ protected: DisplayNode *EntryNode = nullptr; }; -void DisplayNode::createEdge(StringRef V, DisplayNode &Node, - IRChangeDiffType T) { +void DisplayNode::createEdge(StringRef Value, DisplayNode &Node, + StringRef Colour) { assert(!AllEdgesCreated && "Expected to be able to still create edges."); - Edges.emplace_back(V.str(), Node, T); + Edges.emplace_back(Value.str(), Node, Colour); Children.insert(&Node); } @@ -1458,13 +1455,14 @@ public: DotCfgDiffNode() = delete; // Create a node in Dot difference graph \p G representing the basic block - // represented by \p BD with type \p T (where it exists). + // represented by \p BD with colour \p Colour (where it exists). 
DotCfgDiffNode(DotCfgDiff &G, unsigned N, const BlockDataT<DCData> &BD, - IRChangeDiffType T) - : Graph(G), N(N), Data{&BD, nullptr}, Type(T) {} + StringRef Colour) + : Graph(G), N(N), Data{&BD, nullptr}, Colour(Colour) {} DotCfgDiffNode(const DotCfgDiffNode &DN) - : Graph(DN.Graph), N(DN.N), Data{DN.Data[0], DN.Data[1]}, Type(DN.Type), - EdgesMap(DN.EdgesMap), Children(DN.Children), Edges(DN.Edges) {} + : Graph(DN.Graph), N(DN.N), Data{DN.Data[0], DN.Data[1]}, + Colour(DN.Colour), EdgesMap(DN.EdgesMap), Children(DN.Children), + Edges(DN.Edges) {} unsigned getIndex() const { return N; } @@ -1473,29 +1471,29 @@ public: assert(Data[0] && "Expected Data[0] to be set."); return Data[0]->getLabel(); } - // Return where this block exists. - IRChangeDiffType getType() const { return Type; } + // Return the colour for this block + StringRef getColour() const { return Colour; } // Change this basic block from being only in before to being common. // Save the pointer to \p Other. void setCommon(const BlockDataT<DCData> &Other) { assert(!Data[1] && "Expected only one block datum"); Data[1] = &Other; - Type = IsCommon; + Colour = CommonColour; } - // Add an edge to \p E of type {\p Value, \p T}. - void addEdge(unsigned E, StringRef Value, IRChangeDiffType T) { + // Add an edge to \p E of colour {\p Value, \p Colour}. + void addEdge(unsigned E, StringRef Value, StringRef Colour) { // This is a new edge or it is an edge being made common. - assert((EdgesMap.count(E) == 0 || T == IsCommon) && - "Unexpected edge count and type."); - EdgesMap[E] = {Value.str(), T}; + assert((EdgesMap.count(E) == 0 || Colour == CommonColour) && + "Unexpected edge count and color."); + EdgesMap[E] = {Value.str(), Colour}; } // Record the children and create edges. void finalize(DotCfgDiff &G); - // Return the type of the edge to node \p S. - std::pair<std::string, IRChangeDiffType> getEdge(const unsigned S) const { + // Return the colour of the edge to node \p S. 
+ StringRef getEdgeColour(const unsigned S) const { assert(EdgesMap.count(S) == 1 && "Expected to find edge."); - return EdgesMap.at(S); + return EdgesMap.at(S).second; } // Return the string representing the basic block. @@ -1508,8 +1506,8 @@ protected: DotCfgDiff &Graph; const unsigned N; const BlockDataT<DCData> *Data[2]; - IRChangeDiffType Type; - std::map<const unsigned, std::pair<std::string, IRChangeDiffType>> EdgesMap; + StringRef Colour; + std::map<const unsigned, std::pair<std::string, StringRef>> EdgesMap; std::vector<unsigned> Children; std::vector<unsigned> Edges; }; @@ -1552,12 +1550,11 @@ public: protected: // Return the string surrounded by HTML to make it the appropriate colour. - std::string colourize(std::string S, IRChangeDiffType T) const; + std::string colourize(std::string S, StringRef Colour) const; - void createNode(StringRef Label, const BlockDataT<DCData> &BD, - IRChangeDiffType T) { + void createNode(StringRef Label, const BlockDataT<DCData> &BD, StringRef C) { unsigned Pos = Nodes.size(); - Nodes.emplace_back(*this, Pos, BD, T); + Nodes.emplace_back(*this, Pos, BD, C); NodePosition.insert({Label, Pos}); } @@ -1572,7 +1569,7 @@ protected: }; std::string DotCfgDiffNode::getBodyContent() const { - if (Type == IsCommon) { + if (Colour == CommonColour) { assert(Data[1] && "Expected Data[1] to be set."); StringRef SR[2]; @@ -1586,11 +1583,11 @@ std::string DotCfgDiffNode::getBodyContent() const { } SmallString<80> OldLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InBefore]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", BeforeColour); SmallString<80> NewLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[InAfter]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", AfterColour); SmallString<80> UnchangedLineFormat = formatv( - "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", Colours[IsCommon]); + "<FONT COLOR=\"{0}\">%l</FONT><BR align=\"left\"/>", 
CommonColour); std::string Diff = Data[0]->getLabel().str(); Diff += ":\n<BR align=\"left\"/>" + doSystemDiff(makeHTMLReady(SR[0]), makeHTMLReady(SR[1]), @@ -1625,7 +1622,7 @@ std::string DotCfgDiffNode::getBodyContent() const { // drop predecessors as they can be big and are redundant BS1 = BS1.drop_until([](char C) { return C == '\n'; }).drop_front(); - std::string S = "<FONT COLOR=\"" + Colours[Type] + "\">" + Label.str() + ":"; + std::string S = "<FONT COLOR=\"" + Colour.str() + "\">" + Label.str() + ":"; // align each line to the left. while (BS1.size()) { @@ -1638,26 +1635,22 @@ std::string DotCfgDiffNode::getBodyContent() const { return S; } -std::string DotCfgDiff::colourize(std::string S, IRChangeDiffType T) const { +std::string DotCfgDiff::colourize(std::string S, StringRef Colour) const { if (S.length() == 0) return S; - return "<FONT COLOR=\"" + Colours[T] + "\">" + S + "</FONT>"; -} - -std::string DotCfgDiffDisplayGraph::attribute(IRChangeDiffType T) const { - return "color=" + Colours[T]; + return "<FONT COLOR=\"" + Colour.str() + "\">" + S + "</FONT>"; } DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, const FuncDataT<DCData> &After) : GraphName(Title.str()) { - StringMap<IRChangeDiffType> EdgesMap; + StringMap<StringRef> EdgesMap; // Handle each basic block in the before IR. for (auto &B : Before.getData()) { StringRef Label = B.getKey(); const BlockDataT<DCData> &BD = B.getValue(); - createNode(Label, BD, InBefore); + createNode(Label, BD, BeforeColour); // Create transitions with names made up of the from block label, the value // on which the transition is made and the to block label. 
@@ -1666,7 +1659,7 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, Sink != E; ++Sink) { std::string Key = (Label + " " + Sink->getKey().str()).str() + " " + BD.getData().getSuccessorLabel(Sink->getKey()).str(); - EdgesMap.insert({Key, InBefore}); + EdgesMap.insert({Key, BeforeColour}); } } @@ -1677,7 +1670,7 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, unsigned C = NodePosition.count(Label); if (C == 0) // This only exists in the after IR. Create the node. - createNode(Label, BD, InAfter); + createNode(Label, BD, AfterColour); else { assert(C == 1 && "Unexpected multiple nodes."); Nodes[NodePosition[Label]].setCommon(BD); @@ -1690,9 +1683,9 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, BD.getData().getSuccessorLabel(Sink->getKey()).str(); unsigned C = EdgesMap.count(Key); if (C == 0) - EdgesMap.insert({Key, InAfter}); + EdgesMap.insert({Key, AfterColour}); else { - EdgesMap[Key] = IsCommon; + EdgesMap[Key] = CommonColour; } } } @@ -1712,18 +1705,18 @@ DotCfgDiff::DotCfgDiff(StringRef Title, const FuncDataT<DCData> &Before, DotCfgDiffNode &SourceNode = Nodes[NodePosition[Source]]; assert(NodePosition.count(Sink) == 1 && "Expected to find node."); unsigned SinkNode = NodePosition[Sink]; - IRChangeDiffType T = E.second; + StringRef Colour = E.second; // Look for an edge from Source to Sink if (EdgeLabels.count(SourceSink) == 0) - EdgeLabels.insert({SourceSink, colourize(Value.str(), T)}); + EdgeLabels.insert({SourceSink, colourize(Value.str(), Colour)}); else { StringRef V = EdgeLabels.find(SourceSink)->getValue(); - std::string NV = colourize(V.str() + " " + Value.str(), T); - T = IsCommon; + std::string NV = colourize(V.str() + " " + Value.str(), Colour); + Colour = CommonColour; EdgeLabels[SourceSink] = NV; } - SourceNode.addEdge(SinkNode, Value, T); + SourceNode.addEdge(SinkNode, Value, Colour); } for (auto &I : Nodes) I.finalize(*this); @@ -1744,7 +1737,7 @@ 
DotCfgDiffDisplayGraph DotCfgDiff::createDisplayGraph(StringRef Title, for (auto &I : Nodes) { if (I.getIndex() == Entry) EntryIndex = Index; - G.createNode(I.getBodyContent(), I.getType()); + G.createNode(I.getBodyContent(), I.getColour()); NodeMap.insert({I.getIndex(), Index++}); } assert(EntryIndex >= 0 && "Expected entry node index to be set."); @@ -1766,12 +1759,12 @@ void DotCfgDiffNode::createDisplayEdges( for (auto I : Edges) { unsigned SinkNodeIndex = I; - IRChangeDiffType Type = getEdge(SinkNodeIndex).second; + StringRef Colour = getEdgeColour(SinkNodeIndex); const DotCfgDiffNode *SinkNode = &Graph.getNode(SinkNodeIndex); StringRef Label = Graph.getEdgeSourceLabel(getIndex(), SinkNodeIndex); DisplayNode &SinkDisplayNode = DisplayGraph.getNode(SinkNode->getIndex()); - SourceDisplayNode.createEdge(Label, SinkDisplayNode, Type); + SourceDisplayNode.createEdge(Label, SinkDisplayNode, Colour); } SourceDisplayNode.createEdgeMap(); } @@ -1891,12 +1884,7 @@ DCData::DCData(const BasicBlock &B) { } DotCfgChangeReporter::DotCfgChangeReporter(bool Verbose) - : ChangeReporter<IRDataT<DCData>>(Verbose) { - // Set up the colours based on the hidden options. 
- Colours[InBefore] = BeforeColour; - Colours[InAfter] = AfterColour; - Colours[IsCommon] = CommonColour; -} + : ChangeReporter<IRDataT<DCData>>(Verbose) {} void DotCfgChangeReporter::handleFunctionCompare( StringRef Name, StringRef Prefix, StringRef PassID, StringRef Divider, diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index ab3487ecffe8..34e0c5ebcd58 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -110,6 +110,18 @@ static std::string getInstrProfErrString(instrprof_error Err, case instrprof_error::malformed: OS << "malformed instrumentation profile data"; break; + case instrprof_error::missing_debug_info_for_correlation: + OS << "debug info for correlation is required"; + break; + case instrprof_error::unexpected_debug_info_for_correlation: + OS << "debug info for correlation is not necessary"; + break; + case instrprof_error::unable_to_correlate_profile: + OS << "unable to correlate profile"; + break; + case instrprof_error::unsupported_debug_format: + OS << "unsupported debug info format (only DWARF is supported)"; + break; case instrprof_error::invalid_prof: OS << "invalid profile created. 
Please file a bug " "at: " BUG_REPORT_URL @@ -533,8 +545,8 @@ Error readPGOFuncNameStrings(StringRef NameStrings, InstrProfSymtab &Symtab) { void InstrProfRecord::accumulateCounts(CountSumOrPercent &Sum) const { uint64_t FuncSum = 0; Sum.NumEntries += Counts.size(); - for (size_t F = 0, E = Counts.size(); F < E; ++F) - FuncSum += Counts[F]; + for (uint64_t Count : Counts) + FuncSum += Count; Sum.CountSum += FuncSum; for (uint32_t VK = IPVK_First; VK <= IPVK_Last; ++VK) { @@ -674,9 +686,9 @@ void InstrProfValueSiteRecord::merge(InstrProfValueSiteRecord &Input, void InstrProfValueSiteRecord::scale(uint64_t N, uint64_t D, function_ref<void(instrprof_error)> Warn) { - for (auto I = ValueData.begin(), IE = ValueData.end(); I != IE; ++I) { + for (InstrProfValueData &I : ValueData) { bool Overflowed; - I->Count = SaturatingMultiply(I->Count, N, &Overflowed) / D; + I.Count = SaturatingMultiply(I.Count, N, &Overflowed) / D; if (Overflowed) Warn(instrprof_error::counter_overflow); } @@ -1175,7 +1187,8 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { // Create a COMDAT variable INSTR_PROF_RAW_VERSION_VAR to make the runtime // aware this is an ir_level profile so it can set the version flag. 
GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS, - bool InstrEntryBBEnabled) { + bool InstrEntryBBEnabled, + bool DebugInfoCorrelate) { const StringRef VarName(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); Type *IntTy64 = Type::getInt64Ty(M.getContext()); uint64_t ProfileVersion = (INSTR_PROF_RAW_VERSION | VARIANT_MASK_IR_PROF); @@ -1183,6 +1196,8 @@ GlobalVariable *createIRLevelProfileFlagVar(Module &M, bool IsCS, ProfileVersion |= VARIANT_MASK_CSIR_PROF; if (InstrEntryBBEnabled) ProfileVersion |= VARIANT_MASK_INSTR_ENTRY; + if (DebugInfoCorrelate) + ProfileVersion |= VARIANT_MASK_DBG_CORRELATE; auto IRLevelVersionVariable = new GlobalVariable( M, IntTy64, true, GlobalValue::WeakAnyLinkage, Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName); diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp new file mode 100644 index 000000000000..f9c113027da2 --- /dev/null +++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp @@ -0,0 +1,264 @@ +//===-- InstrProfCorrelator.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ProfileData/InstrProfCorrelator.h" +#include "llvm/Object/MachO.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +#define DEBUG_TYPE "correlator" + +using namespace llvm; + +/// Get the __llvm_prf_cnts section. 
+Expected<object::SectionRef> getCountersSection(const object::ObjectFile &Obj) { + for (auto &Section : Obj.sections()) + if (auto SectionName = Section.getName()) + if (SectionName.get() == INSTR_PROF_CNTS_SECT_NAME) + return Section; + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile); +} + +const char *InstrProfCorrelator::FunctionNameAttributeName = "Function Name"; +const char *InstrProfCorrelator::CFGHashAttributeName = "CFG Hash"; +const char *InstrProfCorrelator::NumCountersAttributeName = "Num Counters"; + +llvm::Expected<std::unique_ptr<InstrProfCorrelator::Context>> +InstrProfCorrelator::Context::get(std::unique_ptr<MemoryBuffer> Buffer, + const object::ObjectFile &Obj) { + auto CountersSection = getCountersSection(Obj); + if (auto Err = CountersSection.takeError()) + return std::move(Err); + auto C = std::make_unique<Context>(); + C->Buffer = std::move(Buffer); + C->CountersSectionStart = CountersSection->getAddress(); + C->CountersSectionEnd = C->CountersSectionStart + CountersSection->getSize(); + C->ShouldSwapBytes = Obj.isLittleEndian() != sys::IsLittleEndianHost; + return Expected<std::unique_ptr<Context>>(std::move(C)); +} + +llvm::Expected<std::unique_ptr<InstrProfCorrelator>> +InstrProfCorrelator::get(StringRef DebugInfoFilename) { + auto DsymObjectsOrErr = + object::MachOObjectFile::findDsymObjectMembers(DebugInfoFilename); + if (auto Err = DsymObjectsOrErr.takeError()) + return std::move(Err); + if (!DsymObjectsOrErr->empty()) { + // TODO: Enable profile correlation when there are multiple objects in a + // dSYM bundle. 
+ if (DsymObjectsOrErr->size() > 1) + return createStringError( + std::error_code(), + "Profile correlation using multiple objects is not yet supported"); + DebugInfoFilename = *DsymObjectsOrErr->begin(); + } + auto BufferOrErr = + errorOrToExpected(MemoryBuffer::getFile(DebugInfoFilename)); + if (auto Err = BufferOrErr.takeError()) + return std::move(Err); + + return get(std::move(*BufferOrErr)); +} + +llvm::Expected<std::unique_ptr<InstrProfCorrelator>> +InstrProfCorrelator::get(std::unique_ptr<MemoryBuffer> Buffer) { + auto BinOrErr = object::createBinary(*Buffer); + if (auto Err = BinOrErr.takeError()) + return std::move(Err); + + if (auto *Obj = dyn_cast<object::ObjectFile>(BinOrErr->get())) { + auto CtxOrErr = Context::get(std::move(Buffer), *Obj); + if (auto Err = CtxOrErr.takeError()) + return std::move(Err); + auto T = Obj->makeTriple(); + if (T.isArch64Bit()) + return InstrProfCorrelatorImpl<uint64_t>::get(std::move(*CtxOrErr), *Obj); + if (T.isArch32Bit()) + return InstrProfCorrelatorImpl<uint32_t>::get(std::move(*CtxOrErr), *Obj); + } + return make_error<InstrProfError>( + instrprof_error::unable_to_correlate_profile); +} + +namespace llvm { + +template <> +InstrProfCorrelatorImpl<uint32_t>::InstrProfCorrelatorImpl( + std::unique_ptr<InstrProfCorrelator::Context> Ctx) + : InstrProfCorrelatorImpl(InstrProfCorrelatorKind::CK_32Bit, + std::move(Ctx)) {} +template <> +InstrProfCorrelatorImpl<uint64_t>::InstrProfCorrelatorImpl( + std::unique_ptr<InstrProfCorrelator::Context> Ctx) + : InstrProfCorrelatorImpl(InstrProfCorrelatorKind::CK_64Bit, + std::move(Ctx)) {} +template <> +bool InstrProfCorrelatorImpl<uint32_t>::classof(const InstrProfCorrelator *C) { + return C->getKind() == InstrProfCorrelatorKind::CK_32Bit; +} +template <> +bool InstrProfCorrelatorImpl<uint64_t>::classof(const InstrProfCorrelator *C) { + return C->getKind() == InstrProfCorrelatorKind::CK_64Bit; +} + +} // end namespace llvm + +template <class IntPtrT> 
+llvm::Expected<std::unique_ptr<InstrProfCorrelatorImpl<IntPtrT>>> +InstrProfCorrelatorImpl<IntPtrT>::get( + std::unique_ptr<InstrProfCorrelator::Context> Ctx, + const object::ObjectFile &Obj) { + if (Obj.isELF() || Obj.isMachO()) { + auto DICtx = DWARFContext::create(Obj); + return std::make_unique<DwarfInstrProfCorrelator<IntPtrT>>(std::move(DICtx), + std::move(Ctx)); + } + return make_error<InstrProfError>(instrprof_error::unsupported_debug_format); +} + +template <class IntPtrT> +Error InstrProfCorrelatorImpl<IntPtrT>::correlateProfileData() { + assert(Data.empty() && CompressedNames.empty() && Names.empty()); + correlateProfileDataImpl(); + auto Result = + collectPGOFuncNameStrings(Names, /*doCompression=*/true, CompressedNames); + Names.clear(); + return Result; +} + +template <class IntPtrT> +void InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName, + uint64_t CFGHash, + IntPtrT CounterOffset, + IntPtrT FunctionPtr, + uint32_t NumCounters) { + Data.push_back({ + maybeSwap<uint64_t>(IndexedInstrProf::ComputeHash(FunctionName)), + maybeSwap<uint64_t>(CFGHash), + // In this mode, CounterPtr actually stores the section relative address + // of the counter. + maybeSwap<IntPtrT>(CounterOffset), + maybeSwap<IntPtrT>(FunctionPtr), + // TODO: Value profiling is not yet supported. 
+ /*ValuesPtr=*/maybeSwap<IntPtrT>(0), + maybeSwap<uint32_t>(NumCounters), + /*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)}, + }); + Names.push_back(FunctionName.str()); +} + +template <class IntPtrT> +llvm::Optional<uint64_t> +DwarfInstrProfCorrelator<IntPtrT>::getLocation(const DWARFDie &Die) const { + auto Locations = Die.getLocations(dwarf::DW_AT_location); + if (!Locations) { + consumeError(Locations.takeError()); + return {}; + } + auto &DU = *Die.getDwarfUnit(); + for (auto &Location : *Locations) { + auto AddressSize = DU.getAddressByteSize(); + DataExtractor Data(Location.Expr, DICtx->isLittleEndian(), AddressSize); + DWARFExpression Expr(Data, AddressSize); + for (auto &Op : Expr) + if (Op.getCode() == dwarf::DW_OP_addr) + return Op.getRawOperand(0); + } + return {}; +} + +template <class IntPtrT> +bool DwarfInstrProfCorrelator<IntPtrT>::isDIEOfProbe(const DWARFDie &Die) { + const auto &ParentDie = Die.getParent(); + if (!Die.isValid() || !ParentDie.isValid() || Die.isNULL()) + return false; + if (Die.getTag() != dwarf::DW_TAG_variable) + return false; + if (!ParentDie.isSubprogramDIE()) + return false; + if (!Die.hasChildren()) + return false; + if (const char *Name = Die.getName(DINameKind::ShortName)) + return StringRef(Name).startswith(getInstrProfCountersVarPrefix()); + return false; +} + +template <class IntPtrT> +void DwarfInstrProfCorrelator<IntPtrT>::correlateProfileDataImpl() { + auto maybeAddProbe = [&](DWARFDie Die) { + if (!isDIEOfProbe(Die)) + return; + Optional<const char *> FunctionName; + Optional<uint64_t> CFGHash; + Optional<uint64_t> CounterPtr = getLocation(Die); + auto FunctionPtr = + dwarf::toAddress(Die.getParent().find(dwarf::DW_AT_low_pc)); + Optional<uint64_t> NumCounters; + for (const DWARFDie &Child : Die.children()) { + if (Child.getTag() != dwarf::DW_TAG_LLVM_annotation) + continue; + auto AnnotationFormName = Child.find(dwarf::DW_AT_name); + auto AnnotationFormValue = 
Child.find(dwarf::DW_AT_const_value); + if (!AnnotationFormName || !AnnotationFormValue) + continue; + auto AnnotationNameOrErr = AnnotationFormName->getAsCString(); + if (auto Err = AnnotationNameOrErr.takeError()) { + consumeError(std::move(Err)); + continue; + } + StringRef AnnotationName = *AnnotationNameOrErr; + if (AnnotationName.compare( + InstrProfCorrelator::FunctionNameAttributeName) == 0) { + if (auto EC = + AnnotationFormValue->getAsCString().moveInto(FunctionName)) + consumeError(std::move(EC)); + } else if (AnnotationName.compare( + InstrProfCorrelator::CFGHashAttributeName) == 0) { + CFGHash = AnnotationFormValue->getAsUnsignedConstant(); + } else if (AnnotationName.compare( + InstrProfCorrelator::NumCountersAttributeName) == 0) { + NumCounters = AnnotationFormValue->getAsUnsignedConstant(); + } + } + if (!FunctionName || !CFGHash || !CounterPtr || !NumCounters) { + LLVM_DEBUG(dbgs() << "Incomplete DIE for probe\n\tFunctionName: " + << FunctionName << "\n\tCFGHash: " << CFGHash + << "\n\tCounterPtr: " << CounterPtr + << "\n\tNumCounters: " << NumCounters); + LLVM_DEBUG(Die.dump(dbgs())); + return; + } + uint64_t CountersStart = this->Ctx->CountersSectionStart; + uint64_t CountersEnd = this->Ctx->CountersSectionEnd; + if (*CounterPtr < CountersStart || *CounterPtr >= CountersEnd) { + LLVM_DEBUG( + dbgs() << "CounterPtr out of range for probe\n\tFunction Name: " + << FunctionName << "\n\tExpected: [0x" + << Twine::utohexstr(CountersStart) << ", 0x" + << Twine::utohexstr(CountersEnd) << ")\n\tActual: 0x" + << Twine::utohexstr(*CounterPtr)); + LLVM_DEBUG(Die.dump(dbgs())); + return; + } + if (!FunctionPtr) { + LLVM_DEBUG(dbgs() << "Could not find address of " << *FunctionName + << "\n"); + LLVM_DEBUG(Die.dump(dbgs())); + } + this->addProbe(*FunctionName, *CFGHash, *CounterPtr - CountersStart, + FunctionPtr.getValueOr(0), *NumCounters); + }; + for (auto &CU : DICtx->normal_units()) + for (const auto &Entry : CU->dies()) + maybeAddProbe(DWARFDie(CU.get(), 
&Entry)); + for (auto &CU : DICtx->dwo_units()) + for (const auto &Entry : CU->dies()) + maybeAddProbe(DWARFDie(CU.get(), &Entry)); +} diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 885c1fe49240..37cdf4dd1fe2 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -52,16 +52,19 @@ static Error initializeReader(InstrProfReader &Reader) { } Expected<std::unique_ptr<InstrProfReader>> -InstrProfReader::create(const Twine &Path) { +InstrProfReader::create(const Twine &Path, + const InstrProfCorrelator *Correlator) { // Set up the buffer to read. auto BufferOrError = setupMemoryBuffer(Path); if (Error E = BufferOrError.takeError()) return std::move(E); - return InstrProfReader::create(std::move(BufferOrError.get())); + return InstrProfReader::create(std::move(BufferOrError.get()), Correlator); } Expected<std::unique_ptr<InstrProfReader>> -InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { +InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer, + const InstrProfCorrelator *Correlator) { + // Sanity check the buffer. 
if (uint64_t(Buffer->getBufferSize()) > std::numeric_limits<uint64_t>::max()) return make_error<InstrProfError>(instrprof_error::too_large); @@ -73,9 +76,9 @@ InstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) { if (IndexedInstrProfReader::hasFormat(*Buffer)) Result.reset(new IndexedInstrProfReader(std::move(Buffer))); else if (RawInstrProfReader64::hasFormat(*Buffer)) - Result.reset(new RawInstrProfReader64(std::move(Buffer))); + Result.reset(new RawInstrProfReader64(std::move(Buffer), Correlator)); else if (RawInstrProfReader32::hasFormat(*Buffer)) - Result.reset(new RawInstrProfReader32(std::move(Buffer))); + Result.reset(new RawInstrProfReader32(std::move(Buffer), Correlator)); else if (TextInstrProfReader::hasFormat(*Buffer)) Result.reset(new TextInstrProfReader(std::move(Buffer))); else @@ -352,7 +355,7 @@ Error RawInstrProfReader<IntPtrT>::readNextHeader(const char *CurrentPos) { template <class IntPtrT> Error RawInstrProfReader<IntPtrT>::createSymtab(InstrProfSymtab &Symtab) { - if (Error E = Symtab.create(StringRef(NamesStart, NamesSize))) + if (Error E = Symtab.create(StringRef(NamesStart, NamesEnd - NamesStart))) return error(std::move(E)); for (const RawInstrProf::ProfileData<IntPtrT> *I = Data; I != DataEnd; ++I) { const IntPtrT FPtr = swap(I->FunctionPointer); @@ -369,6 +372,10 @@ Error RawInstrProfReader<IntPtrT>::readHeader( Version = swap(Header.Version); if (GET_VERSION(Version) != RawInstrProf::Version) return error(instrprof_error::unsupported_version); + if (useDebugInfoCorrelate() && !Correlator) + return error(instrprof_error::missing_debug_info_for_correlation); + if (!useDebugInfoCorrelate() && Correlator) + return error(instrprof_error::unexpected_debug_info_for_correlation); BinaryIdsSize = swap(Header.BinaryIdsSize); if (BinaryIdsSize % sizeof(uint64_t)) @@ -380,7 +387,7 @@ Error RawInstrProfReader<IntPtrT>::readHeader( auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters); auto CountersSize = 
swap(Header.CountersSize); auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters); - NamesSize = swap(Header.NamesSize); + auto NamesSize = swap(Header.NamesSize); ValueKindLast = swap(Header.ValueKindLast); auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>); @@ -398,15 +405,27 @@ Error RawInstrProfReader<IntPtrT>::readHeader( if (Start + ValueDataOffset > DataBuffer->getBufferEnd()) return error(instrprof_error::bad_header); - Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( - Start + DataOffset); - DataEnd = Data + DataSize; + if (Correlator) { + // These sizes in the raw file are zero because we constructed them in the + // Correlator. + assert(DataSize == 0 && NamesSize == 0); + assert(CountersDelta == 0 && NamesDelta == 0); + Data = Correlator->getDataPointer(); + DataEnd = Data + Correlator->getDataSize(); + NamesStart = Correlator->getCompressedNamesPointer(); + NamesEnd = NamesStart + Correlator->getCompressedNamesSize(); + } else { + Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>( + Start + DataOffset); + DataEnd = Data + DataSize; + NamesStart = Start + NamesOffset; + NamesEnd = NamesStart + NamesSize; + } // Binary ids start just after the header. BinaryIdsStart = reinterpret_cast<const uint8_t *>(&Header) + sizeof(RawInstrProf::Header); CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset); - NamesStart = Start + NamesOffset; ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset); const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd(); @@ -440,45 +459,50 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts( if (NumCounters == 0) return error(instrprof_error::malformed, "number of counters is zero"); - IntPtrT CounterPtr = Data->CounterPtr; - auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); - ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; - - // Check bounds. 
Note that the counter pointer embedded in the data record - // may itself be corrupt. - if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - "counter pointer is out of bounds"); - - // We need to compute the in-buffer counter offset from the in-memory address - // distance. The initial CountersDelta is the in-memory address difference - // start(__llvm_prf_cnts)-start(__llvm_prf_data), so SrcData->CounterPtr - - // CountersDelta computes the offset into the in-buffer counter section. - // - // CountersDelta decreases as we advance to the next data record. - ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); - CountersDelta -= sizeof(*Data); - if (CounterOffset < 0) - return error( - instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + " is negative").str()); + ArrayRef<uint64_t> RawCounts; + if (Correlator) { + uint64_t CounterOffset = swap<IntPtrT>(Data->CounterPtr) / sizeof(uint64_t); + RawCounts = + makeArrayRef<uint64_t>(CountersStart + CounterOffset, NumCounters); + } else { + IntPtrT CounterPtr = Data->CounterPtr; + ptrdiff_t CounterOffset = getCounterOffset(CounterPtr); + if (CounterOffset < 0) + return error( + instrprof_error::malformed, + ("counter offset " + Twine(CounterOffset) + " is negative").str()); - if (CounterOffset > MaxNumCounters) - return error(instrprof_error::malformed, - ("counter offset " + Twine(CounterOffset) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); + // Check bounds. Note that the counter pointer embedded in the data record + // may itself be corrupt. 
+ auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart); + ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart; + if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters) + return error(instrprof_error::malformed, + "counter pointer is out of bounds"); + // We need to compute the in-buffer counter offset from the in-memory + // address distance. The initial CountersDelta is the in-memory address + // difference start(__llvm_prf_cnts)-start(__llvm_prf_data), so + // SrcData->CounterPtr - CountersDelta computes the offset into the + // in-buffer counter section. + if (CounterOffset > MaxNumCounters) + return error(instrprof_error::malformed, + ("counter offset " + Twine(CounterOffset) + + " is greater than the maximum number of counters " + + Twine((uint32_t)MaxNumCounters)) + .str()); - if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters) - return error(instrprof_error::malformed, - ("number of counters " + - Twine(((uint32_t)CounterOffset + NumCounters)) + - " is greater than the maximum number of counters " + - Twine((uint32_t)MaxNumCounters)) - .str()); + if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters) + return error(instrprof_error::malformed, + ("number of counters " + + Twine(((uint32_t)CounterOffset + NumCounters)) + + " is greater than the maximum number of counters " + + Twine((uint32_t)MaxNumCounters)) + .str()); + // CountersDelta decreases as we advance to the next data record. + CountersDelta -= sizeof(*Data); - auto RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); + RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters); + } if (ShouldSwapBytes) { Record.Counts.clear(); @@ -977,11 +1001,10 @@ IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName, if (Err) return std::move(Err); // Found it. Look for counters with the right hash. 
- for (unsigned I = 0, E = Data.size(); I < E; ++I) { + for (const NamedInstrProfRecord &I : Data) { // Check for a match and fill the vector if there is one. - if (Data[I].Hash == FuncHash) { - return std::move(Data[I]); - } + if (I.Hash == FuncHash) + return std::move(I); } return error(instrprof_error::hash_mismatch); } diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index 492e3541cb5a..6628eea80640 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -32,6 +32,7 @@ #include <vector> using namespace llvm; +extern cl::opt<bool> DebugInfoCorrelate; // A struct to define how the data stream should be patched. For Indexed // profiling, only uint64_t data type is needed. diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp index f54df7b295e3..bbb640cfaee8 100644 --- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -194,7 +194,7 @@ SampleProfileSummaryBuilder::computeSummaryForProfiles( // more function profiles each with lower counts, which in turn leads to lower // hot thresholds. To compensate for that, by default we merge context // profiles before computing profile summary. - if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS && + if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCSFlat && !UseContextLessSummary.getNumOccurrences())) { for (const auto &I : Profiles) { ContextLessProfiles[I.second.getName()].merge(I.second); diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp index fd8fd3b675b7..9b01a386a360 100644 --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -35,11 +35,18 @@ static cl::opt<uint64_t> ProfileSymbolListCutOff( cl::desc("Cutoff value about how many symbols in profile symbol list " "will be used. 
This is very useful for performance debugging")); +cl::opt<bool> GenerateMergedBaseProfiles( + "generate-merged-base-profiles", cl::init(true), cl::ZeroOrMore, + cl::desc("When generating nested context-sensitive profiles, always " + "generate extra base profile for function with all its context " + "profiles merged into it.")); + namespace llvm { namespace sampleprof { SampleProfileFormat FunctionSamples::Format; bool FunctionSamples::ProfileIsProbeBased = false; -bool FunctionSamples::ProfileIsCS = false; +bool FunctionSamples::ProfileIsCSFlat = false; +bool FunctionSamples::ProfileIsCSNested = false; bool FunctionSamples::UseMD5 = false; bool FunctionSamples::HasUniqSuffix = true; bool FunctionSamples::ProfileIsFS = false; @@ -218,8 +225,9 @@ unsigned FunctionSamples::getOffset(const DILocation *DIL) { 0xffff; } -LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL) { - if (FunctionSamples::ProfileIsProbeBased) +LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL, + bool ProfileIsFS) { + if (FunctionSamples::ProfileIsProbeBased) { // In a pseudo-probe based profile, a callsite is simply represented by the // ID of the probe associated with the call instruction. The probe ID is // encoded in the Discriminator field of the call instruction's debug @@ -227,9 +235,19 @@ LineLocation FunctionSamples::getCallSiteIdentifier(const DILocation *DIL) { return LineLocation(PseudoProbeDwarfDiscriminator::extractProbeIndex( DIL->getDiscriminator()), 0); - else - return LineLocation(FunctionSamples::getOffset(DIL), - DIL->getBaseDiscriminator()); + } else { + unsigned Discriminator = + ProfileIsFS ? 
DIL->getDiscriminator() : DIL->getBaseDiscriminator(); + return LineLocation(FunctionSamples::getOffset(DIL), Discriminator); + } +} + +uint64_t FunctionSamples::getCallSiteHash(StringRef CalleeName, + const LineLocation &Callsite) { + uint64_t NameHash = std::hash<std::string>{}(CalleeName.str()); + uint64_t LocId = + (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; + return NameHash + (LocId << 5) + LocId; } const FunctionSamples *FunctionSamples::findFunctionSamples( @@ -239,21 +257,16 @@ const FunctionSamples *FunctionSamples::findFunctionSamples( const DILocation *PrevDIL = DIL; for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) { - unsigned Discriminator; - if (ProfileIsFS) - Discriminator = DIL->getDiscriminator(); - else - Discriminator = DIL->getBaseDiscriminator(); - // Use C++ linkage name if possible. StringRef Name = PrevDIL->getScope()->getSubprogram()->getLinkageName(); if (Name.empty()) Name = PrevDIL->getScope()->getSubprogram()->getName(); - - S.push_back( - std::make_pair(LineLocation(getOffset(DIL), Discriminator), Name)); + S.emplace_back(FunctionSamples::getCallSiteIdentifier( + DIL, FunctionSamples::ProfileIsFS), + Name); PrevDIL = DIL; } + if (S.size() == 0) return this; const FunctionSamples *FS = this; @@ -454,3 +467,81 @@ void ProfileSymbolList::dump(raw_ostream &OS) const { for (auto &Sym : SortedList) OS << Sym << "\n"; } + +CSProfileConverter::FrameNode * +CSProfileConverter::FrameNode::getOrCreateChildFrame( + const LineLocation &CallSite, StringRef CalleeName) { + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); + auto It = AllChildFrames.find(Hash); + if (It != AllChildFrames.end()) { + assert(It->second.FuncName == CalleeName && + "Hash collision for child context node"); + return &It->second; + } + + AllChildFrames[Hash] = FrameNode(CalleeName, nullptr, CallSite); + return &AllChildFrames[Hash]; +} + +CSProfileConverter::CSProfileConverter(SampleProfileMap &Profiles) + : 
ProfileMap(Profiles) { + for (auto &FuncSample : Profiles) { + FunctionSamples *FSamples = &FuncSample.second; + auto *NewNode = getOrCreateContextPath(FSamples->getContext()); + assert(!NewNode->FuncSamples && "New node cannot have sample profile"); + NewNode->FuncSamples = FSamples; + } +} + +CSProfileConverter::FrameNode * +CSProfileConverter::getOrCreateContextPath(const SampleContext &Context) { + auto Node = &RootFrame; + LineLocation CallSiteLoc(0, 0); + for (auto &Callsite : Context.getContextFrames()) { + Node = Node->getOrCreateChildFrame(CallSiteLoc, Callsite.FuncName); + CallSiteLoc = Callsite.Location; + } + return Node; +} + +void CSProfileConverter::convertProfiles(CSProfileConverter::FrameNode &Node) { + // Process each child profile. Add each child profile to callsite profile map + // of the current node `Node` if `Node` comes with a profile. Otherwise + // promote the child profile to a standalone profile. + auto *NodeProfile = Node.FuncSamples; + for (auto &It : Node.AllChildFrames) { + auto &ChildNode = It.second; + convertProfiles(ChildNode); + auto *ChildProfile = ChildNode.FuncSamples; + if (!ChildProfile) + continue; + SampleContext OrigChildContext = ChildProfile->getContext(); + // Reset the child context to be contextless. + ChildProfile->getContext().setName(OrigChildContext.getName()); + if (NodeProfile) { + // Add child profile to the callsite profile map. + auto &SamplesMap = NodeProfile->functionSamplesAt(ChildNode.CallSiteLoc); + SamplesMap.emplace(OrigChildContext.getName().str(), *ChildProfile); + NodeProfile->addTotalSamples(ChildProfile->getTotalSamples()); + } + + // Separate child profile to be a standalone profile, if the current parent + // profile doesn't exist. This is a duplicating operation when the child + // profile is already incorporated into the parent which is still useful and + // thus done optionally. 
It is seen that duplicating context profiles into + // base profiles improves the code quality for thinlto build by allowing a + // profile in the prelink phase for to-be-fully-inlined functions. + if (!NodeProfile || GenerateMergedBaseProfiles) + ProfileMap[ChildProfile->getContext()].merge(*ChildProfile); + + // Contexts coming with a `ContextShouldBeInlined` attribute indicate this + // is a preinliner-computed profile. + if (OrigChildContext.hasAttribute(ContextShouldBeInlined)) + FunctionSamples::ProfileIsCSNested = true; + + // Remove the original child profile. + ProfileMap.erase(OrigChildContext); + } +} + +void CSProfileConverter::convertProfiles() { convertProfiles(RootFrame); } diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index eefb7c2ba627..da16309fb82c 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -146,7 +146,7 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth, if (Depth == 0) return false; - if (Depth == 1 && Input[Depth] == '!') { + if (Input[Depth] == '!') { LineTy = LineType::Metadata; return parseMetadata(Input.substr(Depth), FunctionHash, Attributes); } @@ -244,11 +244,11 @@ std::error_code SampleProfileReaderText::readImpl() { sampleprof_error Result = sampleprof_error::success; InlineCallStack InlineStack; - uint32_t ProbeProfileCount = 0; + uint32_t TopLevelProbeProfileCount = 0; - // SeenMetadata tracks whether we have processed metadata for the current - // top-level function profile. - bool SeenMetadata = false; + // DepthMetadata tracks whether we have processed metadata for the current + // top-level or nested function profile. 
+ uint32_t DepthMetadata = 0; ProfileIsFS = ProfileIsFSDisciminator; FunctionSamples::ProfileIsFS = ProfileIsFS; @@ -275,7 +275,7 @@ std::error_code SampleProfileReaderText::readImpl() { "Expected 'mangled_name:NUM:NUM', found " + *LineIt); return sampleprof_error::malformed; } - SeenMetadata = false; + DepthMetadata = 0; SampleContext FContext(FName, CSNameTable); if (FContext.hasContext()) ++CSProfileCount; @@ -302,7 +302,7 @@ std::error_code SampleProfileReaderText::readImpl() { *LineIt); return sampleprof_error::malformed; } - if (SeenMetadata && LineTy != LineType::Metadata) { + if (LineTy != LineType::Metadata && Depth == DepthMetadata) { // Metadata must be put at the end of a function profile. reportError(LineIt.line_number(), "Found non-metadata after metadata: " + *LineIt); @@ -322,6 +322,7 @@ std::error_code SampleProfileReaderText::readImpl() { FSamples.setName(FName); MergeResult(Result, FSamples.addTotalSamples(NumSamples)); InlineStack.push_back(&FSamples); + DepthMetadata = 0; break; } case LineType::BodyProfile: { @@ -342,11 +343,13 @@ std::error_code SampleProfileReaderText::readImpl() { FunctionSamples &FProfile = *InlineStack.back(); if (FunctionHash) { FProfile.setFunctionHash(FunctionHash); - ++ProbeProfileCount; + if (Depth == 1) + ++TopLevelProbeProfileCount; } - if (Attributes) - FProfile.getContext().setAllAttributes(Attributes); - SeenMetadata = true; + FProfile.getContext().setAllAttributes(Attributes); + if (Attributes & (uint32_t)ContextShouldBeInlined) + ProfileIsCSNested = true; + DepthMetadata = Depth; break; } } @@ -355,12 +358,14 @@ std::error_code SampleProfileReaderText::readImpl() { assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) && "Cannot have both context-sensitive and regular profile"); - ProfileIsCS = (CSProfileCount > 0); - assert((ProbeProfileCount == 0 || ProbeProfileCount == Profiles.size()) && + ProfileIsCSFlat = (CSProfileCount > 0); + assert((TopLevelProbeProfileCount == 0 || + 
TopLevelProbeProfileCount == Profiles.size()) && "Cannot have both probe-based profiles and regular profiles"); - ProfileIsProbeBased = (ProbeProfileCount > 0); + ProfileIsProbeBased = (TopLevelProbeProfileCount > 0); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; - FunctionSamples::ProfileIsCS = ProfileIsCS; + FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat; + FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; if (Result == sampleprof_error::success) computeSummary(); @@ -625,7 +630,7 @@ SampleProfileReaderExtBinaryBase::readContextFromTable() { ErrorOr<SampleContext> SampleProfileReaderExtBinaryBase::readSampleContextFromTable() { - if (ProfileIsCS) { + if (ProfileIsCSFlat) { auto FContext(readContextFromTable()); if (std::error_code EC = FContext.getError()) return EC; @@ -649,7 +654,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial)) Summary->setPartialProfile(true); if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) - FunctionSamples::ProfileIsCS = ProfileIsCS = true; + FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat = true; if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator)) FunctionSamples::ProfileIsFS = ProfileIsFS = true; break; @@ -683,6 +688,9 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( ProfileIsProbeBased = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; + ProfileIsCSNested = + hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested); + FunctionSamples::ProfileIsCSNested = ProfileIsCSNested; bool HasAttribute = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute); if (std::error_code EC = readFuncMetadata(HasAttribute)) @@ -770,7 +778,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() { } } - if (ProfileIsCS) { + if (ProfileIsCSFlat) { DenseSet<uint64_t> FuncGuidsToUse; if (useMD5()) { for 
(auto Name : FuncsToUse) @@ -840,7 +848,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() { } assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) && "Cannot have both context-sensitive and regular profile"); - assert((!CSProfileCount || ProfileIsCS) && + assert((!CSProfileCount || ProfileIsCSFlat) && "Section flag should be consistent with actual profile"); return sampleprof_error::success; } @@ -1078,30 +1086,77 @@ std::error_code SampleProfileReaderExtBinaryBase::readCSNameTableSec() { } std::error_code -SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) { - while (Data < End) { - auto FContext(readSampleContextFromTable()); - if (std::error_code EC = FContext.getError()) - return EC; - bool ProfileInMap = Profiles.count(*FContext); +SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute, + FunctionSamples *FProfile) { + if (Data < End) { if (ProfileIsProbeBased) { auto Checksum = readNumber<uint64_t>(); if (std::error_code EC = Checksum.getError()) return EC; - if (ProfileInMap) - Profiles[*FContext].setFunctionHash(*Checksum); + if (FProfile) + FProfile->setFunctionHash(*Checksum); } if (ProfileHasAttribute) { auto Attributes = readNumber<uint32_t>(); if (std::error_code EC = Attributes.getError()) return EC; - if (ProfileInMap) - Profiles[*FContext].getContext().setAllAttributes(*Attributes); + if (FProfile) + FProfile->getContext().setAllAttributes(*Attributes); + } + + if (!ProfileIsCSFlat) { + // Read all the attributes for inlined function calls. 
+ auto NumCallsites = readNumber<uint32_t>(); + if (std::error_code EC = NumCallsites.getError()) + return EC; + + for (uint32_t J = 0; J < *NumCallsites; ++J) { + auto LineOffset = readNumber<uint64_t>(); + if (std::error_code EC = LineOffset.getError()) + return EC; + + auto Discriminator = readNumber<uint64_t>(); + if (std::error_code EC = Discriminator.getError()) + return EC; + + auto FContext(readSampleContextFromTable()); + if (std::error_code EC = FContext.getError()) + return EC; + + FunctionSamples *CalleeProfile = nullptr; + if (FProfile) { + CalleeProfile = const_cast<FunctionSamples *>( + &FProfile->functionSamplesAt(LineLocation( + *LineOffset, + *Discriminator))[std::string(FContext.get().getName())]); + } + if (std::error_code EC = + readFuncMetadata(ProfileHasAttribute, CalleeProfile)) + return EC; + } } } + return sampleprof_error::success; +} + +std::error_code +SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) { + while (Data < End) { + auto FContext(readSampleContextFromTable()); + if (std::error_code EC = FContext.getError()) + return EC; + FunctionSamples *FProfile = nullptr; + auto It = Profiles.find(*FContext); + if (It != Profiles.end()) + FProfile = &It->second; + + if (std::error_code EC = readFuncMetadata(ProfileHasAttribute, FProfile)) + return EC; + } + assert(Data == End && "More data is read than expected"); return sampleprof_error::success; } @@ -1233,6 +1288,8 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) { Flags.append("probe,"); if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute)) Flags.append("attr,"); + if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested)) + Flags.append("preinlined,"); break; default: break; diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 78006aab1541..6f02bd203a9f 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -172,7 +172,7 
@@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() { return (std::error_code)sampleprof_error::success; }; - if (FunctionSamples::ProfileIsCS) { + if (FunctionSamples::ProfileIsCSFlat) { // Sort the contexts before writing them out. This is to help fast load all // context profiles for a function as well as their callee contexts which // can help profile-guided importing for ThinLTO. @@ -195,17 +195,45 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() { } std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata( + const FunctionSamples &FunctionProfile) { + auto &OS = *OutputStream; + if (std::error_code EC = writeContextIdx(FunctionProfile.getContext())) + return EC; + + if (FunctionSamples::ProfileIsProbeBased) + encodeULEB128(FunctionProfile.getFunctionHash(), OS); + if (FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested) { + encodeULEB128(FunctionProfile.getContext().getAllAttributes(), OS); + } + + if (!FunctionSamples::ProfileIsCSFlat) { + // Recursively emit attributes for all callee samples. 
+ uint64_t NumCallsites = 0; + for (const auto &J : FunctionProfile.getCallsiteSamples()) + NumCallsites += J.second.size(); + encodeULEB128(NumCallsites, OS); + for (const auto &J : FunctionProfile.getCallsiteSamples()) { + for (const auto &FS : J.second) { + LineLocation Loc = J.first; + encodeULEB128(Loc.LineOffset, OS); + encodeULEB128(Loc.Discriminator, OS); + if (std::error_code EC = writeFuncMetadata(FS.second)) + return EC; + } + } + } + + return sampleprof_error::success; +} + +std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata( const SampleProfileMap &Profiles) { - if (!FunctionSamples::ProfileIsProbeBased && !FunctionSamples::ProfileIsCS) + if (!FunctionSamples::ProfileIsProbeBased && + !FunctionSamples::ProfileIsCSFlat && !FunctionSamples::ProfileIsCSNested) return sampleprof_error::success; - auto &OS = *OutputStream; for (const auto &Entry : Profiles) { - if (std::error_code EC = writeContextIdx(Entry.second.getContext())) + if (std::error_code EC = writeFuncMetadata(Entry.second)) return EC; - if (FunctionSamples::ProfileIsProbeBased) - encodeULEB128(Entry.second.getFunctionHash(), OS); - if (FunctionSamples::ProfileIsCS) - encodeULEB128(Entry.second.getContext().getAllAttributes(), OS); } return sampleprof_error::success; } @@ -295,10 +323,13 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( setToCompressSection(SecProfileSymbolList); if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased); - if (Type == SecProfSummary && FunctionSamples::ProfileIsCS) - addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); - if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCS) + if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCSNested) + addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsCSNested); + if (Type == SecFuncMetadata && + (FunctionSamples::ProfileIsCSFlat || 
FunctionSamples::ProfileIsCSNested)) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute); + if (Type == SecProfSummary && FunctionSamples::ProfileIsCSFlat) + addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); if (Type == SecProfSummary && FunctionSamples::ProfileIsFS) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator); @@ -440,7 +471,7 @@ SampleProfileWriterCompactBinary::write(const SampleProfileMap &ProfileMap) { /// it needs to be parsed by the SampleProfileReaderText class. std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { auto &OS = *OutputStream; - if (FunctionSamples::ProfileIsCS) + if (FunctionSamples::ProfileIsCSFlat) OS << "[" << S.getContext().toString() << "]:" << S.getTotalSamples(); else OS << S.getName() << ":" << S.getTotalSamples(); @@ -483,15 +514,14 @@ std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { } Indent -= 1; - if (Indent == 0) { - if (FunctionSamples::ProfileIsProbeBased) { - OS.indent(Indent + 1); - OS << "!CFGChecksum: " << S.getFunctionHash() << "\n"; - } - if (FunctionSamples::ProfileIsCS) { - OS.indent(Indent + 1); - OS << "!Attributes: " << S.getContext().getAllAttributes() << "\n"; - } + if (FunctionSamples::ProfileIsProbeBased) { + OS.indent(Indent + 1); + OS << "!CFGChecksum: " << S.getFunctionHash() << "\n"; + } + + if (S.getContext().getAllAttributes()) { + OS.indent(Indent + 1); + OS << "!Attributes: " << S.getContext().getAllAttributes() << "\n"; } return sampleprof_error::success; @@ -841,7 +871,8 @@ SampleProfileWriter::create(std::unique_ptr<raw_ostream> &OS, std::unique_ptr<SampleProfileWriter> Writer; // Currently only Text and Extended Binary format are supported for CSSPGO. 
- if ((FunctionSamples::ProfileIsCS || FunctionSamples::ProfileIsProbeBased) && + if ((FunctionSamples::ProfileIsCSFlat || + FunctionSamples::ProfileIsProbeBased) && (Format == SPF_Binary || Format == SPF_Compact_Binary)) return sampleprof_error::unsupported_writing_format; diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp index a3e41ccd199c..4bc9c8487131 100644 --- a/llvm/lib/Support/AArch64TargetParser.cpp +++ b/llvm/lib/Support/AArch64TargetParser.cpp @@ -240,4 +240,4 @@ AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) { return C.ArchID; } return ArchKind::INVALID; -}
\ No newline at end of file +} diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp index a2fe37a26617..8c685640f791 100644 --- a/llvm/lib/Support/Caching.cpp +++ b/llvm/lib/Support/Caching.cpp @@ -79,14 +79,13 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, struct CacheStream : CachedFileStream { AddBufferFn AddBuffer; sys::fs::TempFile TempFile; - std::string EntryPath; unsigned Task; CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer, sys::fs::TempFile TempFile, std::string EntryPath, unsigned Task) - : CachedFileStream(std::move(OS)), AddBuffer(std::move(AddBuffer)), - TempFile(std::move(TempFile)), EntryPath(std::move(EntryPath)), + : CachedFileStream(std::move(OS), std::move(EntryPath)), + AddBuffer(std::move(AddBuffer)), TempFile(std::move(TempFile)), Task(Task) {} ~CacheStream() { @@ -99,7 +98,7 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, // Open the file first to avoid racing with a cache pruner. ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getOpenFile( - sys::fs::convertFDToNativeFile(TempFile.FD), TempFile.TmpName, + sys::fs::convertFDToNativeFile(TempFile.FD), ObjectPathName, /*FileSize=*/-1, /*RequiresNullTerminator=*/false); if (!MBOrErr) report_fatal_error(Twine("Failed to open new cache file ") + @@ -115,14 +114,14 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, // AddBuffer a copy of the bytes we wrote in that case. We do this // instead of just using the existing file, because the pruner might // delete the file before we get a chance to use it. 
- Error E = TempFile.keep(EntryPath); + Error E = TempFile.keep(ObjectPathName); E = handleErrors(std::move(E), [&](const ECError &E) -> Error { std::error_code EC = E.convertToErrorCode(); if (EC != errc::permission_denied) return errorCodeToError(EC); auto MBCopy = MemoryBuffer::getMemBufferCopy((*MBOrErr)->getBuffer(), - EntryPath); + ObjectPathName); MBOrErr = std::move(MBCopy); // FIXME: should we consume the discard error? @@ -133,7 +132,7 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef, if (E) report_fatal_error(Twine("Failed to rename temporary file ") + - TempFile.TmpName + " to " + EntryPath + ": " + + TempFile.TmpName + " to " + ObjectPathName + ": " + toString(std::move(E)) + "\n"); AddBuffer(Task, std::move(*MBOrErr)); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 5b7004c86f5a..4153a69abf5d 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -1538,10 +1538,8 @@ bool CommandLineParser::ParseCommandLineOptions(int argc, ErrorParsing = true; } else { - for (SmallVectorImpl<Option *>::iterator I = SinkOpts.begin(), - E = SinkOpts.end(); - I != E; ++I) - (*I)->addOccurrence(i, "", StringRef(argv[i])); + for (Option *SinkOpt : SinkOpts) + SinkOpt->addOccurrence(i, "", StringRef(argv[i])); } continue; } @@ -2303,11 +2301,8 @@ protected: // Collect registered option categories into vector in preparation for // sorting. - for (auto I = GlobalParser->RegisteredOptionCategories.begin(), - E = GlobalParser->RegisteredOptionCategories.end(); - I != E; ++I) { - SortedCategories.push_back(*I); - } + for (OptionCategory *Category : GlobalParser->RegisteredOptionCategories) + SortedCategories.push_back(Category); // Sort the different option categories alphabetically. assert(SortedCategories.size() > 0 && "No option categories registered!"); @@ -2315,11 +2310,8 @@ protected: OptionCategoryCompare); // Create map to empty vectors. 
- for (std::vector<OptionCategory *>::const_iterator - I = SortedCategories.begin(), - E = SortedCategories.end(); - I != E; ++I) - CategorizedOptions[*I] = std::vector<Option *>(); + for (OptionCategory *Category : SortedCategories) + CategorizedOptions[Category] = std::vector<Option *>(); // Walk through pre-sorted options and assign into categories. // Because the options are already alphabetically sorted the @@ -2334,23 +2326,20 @@ protected: } // Now do printing. - for (std::vector<OptionCategory *>::const_iterator - Category = SortedCategories.begin(), - E = SortedCategories.end(); - Category != E; ++Category) { + for (OptionCategory *Category : SortedCategories) { // Hide empty categories for --help, but show for --help-hidden. - const auto &CategoryOptions = CategorizedOptions[*Category]; + const auto &CategoryOptions = CategorizedOptions[Category]; bool IsEmptyCategory = CategoryOptions.empty(); if (!ShowHidden && IsEmptyCategory) continue; // Print category information. outs() << "\n"; - outs() << (*Category)->getName() << ":\n"; + outs() << Category->getName() << ":\n"; // Check if description is set. 
- if (!(*Category)->getDescription().empty()) - outs() << (*Category)->getDescription() << "\n\n"; + if (!Category->getDescription().empty()) + outs() << Category->getDescription() << "\n\n"; else outs() << "\n"; diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp index b8c77cf69b95..ccf6ef4bb662 100644 --- a/llvm/lib/Support/Compression.cpp +++ b/llvm/lib/Support/Compression.cpp @@ -49,14 +49,14 @@ bool zlib::isAvailable() { return true; } Error zlib::compress(StringRef InputBuffer, SmallVectorImpl<char> &CompressedBuffer, int Level) { unsigned long CompressedSize = ::compressBound(InputBuffer.size()); - CompressedBuffer.reserve(CompressedSize); + CompressedBuffer.resize_for_overwrite(CompressedSize); int Res = ::compress2((Bytef *)CompressedBuffer.data(), &CompressedSize, (const Bytef *)InputBuffer.data(), InputBuffer.size(), Level); // Tell MemorySanitizer that zlib output buffer is fully initialized. // This avoids a false report when running LLVM with uninstrumented ZLib. __msan_unpoison(CompressedBuffer.data(), CompressedSize); - CompressedBuffer.set_size(CompressedSize); + CompressedBuffer.truncate(CompressedSize); return Res ? 
createError(convertZlibCodeToString(Res)) : Error::success(); } @@ -74,10 +74,10 @@ Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer, Error zlib::uncompress(StringRef InputBuffer, SmallVectorImpl<char> &UncompressedBuffer, size_t UncompressedSize) { - UncompressedBuffer.reserve(UncompressedSize); + UncompressedBuffer.resize_for_overwrite(UncompressedSize); Error E = uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize); - UncompressedBuffer.set_size(UncompressedSize); + UncompressedBuffer.truncate(UncompressedSize); return E; } diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp index d8d46712a593..392c4c4890e1 100644 --- a/llvm/lib/Support/ConvertUTFWrapper.cpp +++ b/llvm/lib/Support/ConvertUTFWrapper.cpp @@ -103,8 +103,8 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) { std::vector<UTF16> ByteSwapped; if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) { ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd); - for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I) - ByteSwapped[I] = llvm::ByteSwap_16(ByteSwapped[I]); + for (UTF16 &I : ByteSwapped) + I = llvm::ByteSwap_16(I); Src = &ByteSwapped[0]; SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1; } diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index e5e6301d41cc..a6daee00bd43 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -180,22 +180,19 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( DAGDeltaAlgorithm &DDA, const changeset_ty &Changes, const std::vector<edge_ty> &Dependencies) : DDA(DDA) { - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) { - Predecessors.insert(std::make_pair(*it, std::vector<change_ty>())); - Successors.insert(std::make_pair(*it, std::vector<change_ty>())); + for (change_ty Change : Changes) { + Predecessors.insert(std::make_pair(Change, 
std::vector<change_ty>())); + Successors.insert(std::make_pair(Change, std::vector<change_ty>())); } - for (std::vector<edge_ty>::const_iterator it = Dependencies.begin(), - ie = Dependencies.end(); it != ie; ++it) { - Predecessors[it->second].push_back(it->first); - Successors[it->first].push_back(it->second); + for (const edge_ty &Dep : Dependencies) { + Predecessors[Dep.second].push_back(Dep.first); + Successors[Dep.first].push_back(Dep.second); } // Compute the roots. - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - if (succ_begin(*it) == succ_end(*it)) - Roots.push_back(*it); + for (change_ty Change : Changes) + if (succ_begin(Change) == succ_end(Change)) + Roots.push_back(Change); // Pre-compute the closure of the successor relation. std::vector<change_ty> Worklist(Roots.begin(), Roots.end()); @@ -213,14 +210,13 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( } // Invert to form the predecessor closure map. - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - PredClosure.insert(std::make_pair(*it, std::set<change_ty>())); - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), - ie2 = succ_closure_end(*it); it2 != ie2; ++it2) - PredClosure[*it2].insert(*it); + for (change_ty Change : Changes) + PredClosure.insert(std::make_pair(Change, std::set<change_ty>())); + for (change_ty Change : Changes) + for (succ_closure_iterator_ty it2 = succ_closure_begin(Change), + ie2 = succ_closure_end(Change); + it2 != ie2; ++it2) + PredClosure[*it2].insert(Change); // Dump useful debug info. 
LLVM_DEBUG({ @@ -256,13 +252,12 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( llvm::errs() << "]\n"; llvm::errs() << "Predecessor Closure:\n"; - for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end(); - it != ie; ++it) { - llvm::errs() << format(" %-4d: [", *it); - for (pred_closure_iterator_ty it2 = pred_closure_begin(*it), - ie2 = pred_closure_end(*it); + for (change_ty Change : Changes) { + llvm::errs() << format(" %-4d: [", Change); + for (pred_closure_iterator_ty it2 = pred_closure_begin(Change), + ie2 = pred_closure_end(Change); it2 != ie2; ++it2) { - if (it2 != pred_closure_begin(*it)) + if (it2 != pred_closure_begin(Change)) llvm::errs() << ", "; llvm::errs() << *it2; } @@ -270,13 +265,12 @@ DAGDeltaAlgorithmImpl::DAGDeltaAlgorithmImpl( } llvm::errs() << "Successor Closure:\n"; - for (changeset_ty::const_iterator it = Changes.begin(), ie = Changes.end(); - it != ie; ++it) { - llvm::errs() << format(" %-4d: [", *it); - for (succ_closure_iterator_ty it2 = succ_closure_begin(*it), - ie2 = succ_closure_end(*it); + for (change_ty Change : Changes) { + llvm::errs() << format(" %-4d: [", Change); + for (succ_closure_iterator_ty it2 = succ_closure_begin(Change), + ie2 = succ_closure_end(Change); it2 != ie2; ++it2) { - if (it2 != succ_closure_begin(*it)) + if (it2 != succ_closure_begin(Change)) llvm::errs() << ", "; llvm::errs() << *it2; } @@ -291,9 +285,8 @@ bool DAGDeltaAlgorithmImpl::GetTestResult(const changeset_ty &Changes, const changeset_ty &Required) { changeset_ty Extended(Required); Extended.insert(Changes.begin(), Changes.end()); - for (changeset_ty::const_iterator it = Changes.begin(), - ie = Changes.end(); it != ie; ++it) - Extended.insert(pred_closure_begin(*it), pred_closure_end(*it)); + for (change_ty Change : Changes) + Extended.insert(pred_closure_begin(Change), pred_closure_end(Change)); if (FailedTestsCache.count(Extended)) return false; @@ -340,9 +333,8 @@ DAGDeltaAlgorithmImpl::Run() { // Replace the current set with the 
predecssors of the minimized set of // active changes. CurrentSet.clear(); - for (changeset_ty::const_iterator it = CurrentMinSet.begin(), - ie = CurrentMinSet.end(); it != ie; ++it) - CurrentSet.insert(pred_begin(*it), pred_end(*it)); + for (change_ty CT : CurrentMinSet) + CurrentSet.insert(pred_begin(CT), pred_end(CT)); // FIXME: We could enforce CurrentSet intersect Required == {} here if we // wanted to protect against cyclic graphs. diff --git a/llvm/lib/Support/DeltaAlgorithm.cpp b/llvm/lib/Support/DeltaAlgorithm.cpp index 6aee69f43405..a2017a10ab3f 100644 --- a/llvm/lib/Support/DeltaAlgorithm.cpp +++ b/llvm/lib/Support/DeltaAlgorithm.cpp @@ -57,9 +57,8 @@ DeltaAlgorithm::Delta(const changeset_ty &Changes, // Otherwise, partition the sets if possible; if not we are done. changesetlist_ty SplitSets; - for (changesetlist_ty::const_iterator it = Sets.begin(), - ie = Sets.end(); it != ie; ++it) - Split(*it, SplitSets); + for (const changeset_ty &Set : Sets) + Split(Set, SplitSets); if (SplitSets.size() == Sets.size()) return Changes; diff --git a/llvm/lib/Support/HTTPClient.cpp b/llvm/lib/Support/HTTPClient.cpp deleted file mode 100644 index 68ba56d1fe50..000000000000 --- a/llvm/lib/Support/HTTPClient.cpp +++ /dev/null @@ -1,97 +0,0 @@ -//===-- llvm/Support/HTTPClient.cpp - HTTP client library -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// -/// This file defines the methods of the HTTPRequest, HTTPClient, and -/// BufferedHTTPResponseHandler classes. 
-/// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/HTTPClient.h" -#include "llvm/ADT/APInt.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Errc.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBuffer.h" - -using namespace llvm; - -HTTPRequest::HTTPRequest(StringRef Url) { this->Url = Url.str(); } - -bool operator==(const HTTPRequest &A, const HTTPRequest &B) { - return A.Url == B.Url && A.Method == B.Method && - A.FollowRedirects == B.FollowRedirects; -} - -HTTPResponseHandler::~HTTPResponseHandler() = default; - -static inline bool parseContentLengthHeader(StringRef LineRef, - size_t &ContentLength) { - // Content-Length is a mandatory header, and the only one we handle. - return LineRef.consume_front("Content-Length: ") && - to_integer(LineRef.trim(), ContentLength, 10); -} - -Error BufferedHTTPResponseHandler::handleHeaderLine(StringRef HeaderLine) { - if (ResponseBuffer.Body) - return Error::success(); - - size_t ContentLength; - if (parseContentLengthHeader(HeaderLine, ContentLength)) - ResponseBuffer.Body = - WritableMemoryBuffer::getNewUninitMemBuffer(ContentLength); - - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleBodyChunk(StringRef BodyChunk) { - if (!ResponseBuffer.Body) - return createStringError(errc::io_error, - "Unallocated response buffer. 
HTTP Body data " - "received before Content-Length header."); - if (Offset + BodyChunk.size() > ResponseBuffer.Body->getBufferSize()) - return createStringError(errc::io_error, - "Content size exceeds buffer size."); - memcpy(ResponseBuffer.Body->getBufferStart() + Offset, BodyChunk.data(), - BodyChunk.size()); - Offset += BodyChunk.size(); - return Error::success(); -} - -Error BufferedHTTPResponseHandler::handleStatusCode(unsigned Code) { - ResponseBuffer.Code = Code; - return Error::success(); -} - -Expected<HTTPResponseBuffer> HTTPClient::perform(const HTTPRequest &Request) { - BufferedHTTPResponseHandler Handler; - if (Error Err = perform(Request, Handler)) - return std::move(Err); - return std::move(Handler.ResponseBuffer); -} - -Expected<HTTPResponseBuffer> HTTPClient::get(StringRef Url) { - HTTPRequest Request(Url); - return perform(Request); -} - -HTTPClient::HTTPClient() = default; - -HTTPClient::~HTTPClient() = default; - -bool HTTPClient::isAvailable() { return false; } - -void HTTPClient::cleanup() {} - -void HTTPClient::setTimeout(std::chrono::milliseconds Timeout) {} - -Error HTTPClient::perform(const HTTPRequest &Request, - HTTPResponseHandler &Handler) { - llvm_unreachable("No HTTP Client implementation available."); -} diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 554e3248524c..8e154067abc0 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -420,11 +420,19 @@ KnownBits KnownBits::mul(const KnownBits &LHS, const KnownBits &RHS, assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) && "Self multiplication knownbits mismatch"); - // Compute a conservative estimate for high known-0 bits. 
- unsigned LHSLeadZ = LHS.countMinLeadingZeros(); - unsigned RHSLeadZ = RHS.countMinLeadingZeros(); - unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth; - assert(LeadZ <= BitWidth && "More zeros than bits?"); + // Compute the high known-0 bits by multiplying the unsigned max of each side. + // Conservatively, M active bits * N active bits results in M + N bits in the + // result. But if we know a value is a power-of-2 for example, then this + // computes one more leading zero. + // TODO: This could be generalized to number of sign bits (negative numbers). + APInt UMaxLHS = LHS.getMaxValue(); + APInt UMaxRHS = RHS.getMaxValue(); + + // For leading zeros in the result to be valid, the unsigned max product must + // fit in the bitwidth (it must not overflow). + bool HasOverflow; + APInt UMaxResult = UMaxLHS.umul_ov(UMaxRHS, HasOverflow); + unsigned LeadZ = HasOverflow ? 0 : UMaxResult.countLeadingZeros(); // The result of the bottom bits of an integer multiply can be // inferred by looking at the bottom bits of both operands and diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index bcf13d828a5d..d3fa3c6f065d 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -227,17 +227,20 @@ static ErrorOr<std::unique_ptr<WritableMemoryBuffer>> getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) { const ssize_t ChunkSize = 4096*4; SmallString<ChunkSize> Buffer; + // Read into Buffer until we hit EOF. 
+ size_t Size = Buffer.size(); for (;;) { - Buffer.reserve(Buffer.size() + ChunkSize); + Buffer.resize_for_overwrite(Size + ChunkSize); Expected<size_t> ReadBytes = sys::fs::readNativeFile( - FD, makeMutableArrayRef(Buffer.end(), ChunkSize)); + FD, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize)); if (!ReadBytes) return errorToErrorCode(ReadBytes.takeError()); if (*ReadBytes == 0) break; - Buffer.set_size(Buffer.size() + *ReadBytes); + Size += *ReadBytes; } + Buffer.truncate(Size); return getMemBufferCopyImpl(Buffer, BufferName); } diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp index ae9f03745850..254d18d797b3 100644 --- a/llvm/lib/Support/NativeFormatting.cpp +++ b/llvm/lib/Support/NativeFormatting.cpp @@ -168,7 +168,7 @@ void llvm::write_double(raw_ostream &S, double N, FloatStyle Style, S << "nan"; return; } else if (std::isinf(N)) { - S << "INF"; + S << (std::signbit(N) ? "-INF" : "INF"); return; } diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 3957547dfaaa..7c99d088911c 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -474,7 +474,7 @@ StringRef parent_path(StringRef path, Style style) { void remove_filename(SmallVectorImpl<char> &path, Style style) { size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()), style); if (end_pos != StringRef::npos) - path.set_size(end_pos); + path.truncate(end_pos); } void replace_extension(SmallVectorImpl<char> &path, const Twine &extension, @@ -486,7 +486,7 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension, // Erase existing extension. size_t pos = p.find_last_of('.'); if (pos != StringRef::npos && pos >= filename_pos(p, style)) - path.set_size(pos); + path.truncate(pos); // Append '.' if needed. 
if (ext.size() > 0 && ext[0] != '.') diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 8e984002f90d..e2e4340f44e9 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -61,7 +61,6 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zbs", RISCVExtensionVersion{1, 0}}, {"zbt", RISCVExtensionVersion{0, 93}}, - {"zvamo", RISCVExtensionVersion{0, 10}}, {"zvlsseg", RISCVExtensionVersion{0, 10}}, {"zfhmin", RISCVExtensionVersion{0, 1}}, @@ -72,6 +71,28 @@ static bool stripExperimentalPrefix(StringRef &Ext) { return Ext.consume_front("experimental-"); } +// This function finds the first character that doesn't belong to a version +// (e.g. zbe0p93 is extension 'zbe' of version '0p93'). So the function will +// consume [0-9]*p[0-9]* starting from the backward. An extension name will not +// end with a digit or the letter 'p', so this function will parse correctly. +// NOTE: This function is NOT able to take empty strings or strings that only +// have version numbers and no extension name. It assumes the extension name +// will be at least more than one character. 
+static size_t findFirstNonVersionCharacter(const StringRef &Ext) { + if (Ext.size() == 0) + llvm_unreachable("Already guarded by if-statement in ::parseArchString"); + + int Pos = Ext.size() - 1; + while (Pos > 0 && isDigit(Ext[Pos])) + Pos--; + if (Pos > 0 && Ext[Pos] == 'p' && isDigit(Ext[Pos - 1])) { + Pos--; + while (Pos > 0 && isDigit(Ext[Pos])) + Pos--; + } + return Pos; +} + struct FindByName { FindByName(StringRef Ext) : Ext(Ext){}; StringRef Ext; @@ -264,10 +285,6 @@ void RISCVISAInfo::toFeatures( if (ExtName == "zvlsseg") { Features.push_back("+experimental-v"); Features.push_back("+experimental-zvlsseg"); - } else if (ExtName == "zvamo") { - Features.push_back("+experimental-v"); - Features.push_back("+experimental-zvlsseg"); - Features.push_back("+experimental-zvamo"); } else if (isExperimentalExtension(ExtName)) { Features.push_back(StrAlloc("+experimental-" + ExtName)); } else { @@ -390,7 +407,6 @@ RISCVISAInfo::parseFeatures(unsigned XLen, assert(XLen == 32 || XLen == 64); std::unique_ptr<RISCVISAInfo> ISAInfo(new RISCVISAInfo(XLen)); - bool HasE = false; for (auto &Feature : Features) { StringRef ExtName = Feature; bool Experimental = false; @@ -409,29 +425,19 @@ RISCVISAInfo::parseFeatures(unsigned XLen, if (ExtensionInfoIterator == ExtensionInfos.end()) continue; - if (Add) { - if (ExtName == "e") { - if (XLen != 32) - return createStringError( - errc::invalid_argument, - "standard user-level extension 'e' requires 'rv32'"); - HasE = true; - } - + if (Add) ISAInfo->addExtension(ExtName, ExtensionInfoIterator->Version.Major, ExtensionInfoIterator->Version.Minor); - } else - ISAInfo->Exts.erase(ExtName.str()); - } - if (!HasE) { - if (auto Version = findDefaultVersion("i")) - ISAInfo->addExtension("i", Version->Major, Version->Minor); else - llvm_unreachable("Default extension version for 'i' not found?"); + ISAInfo->Exts.erase(ExtName.str()); } + ISAInfo->updateImplication(); ISAInfo->updateFLen(); + if (Error Result = ISAInfo->checkDependency()) 
+ return std::move(Result); + return std::move(ISAInfo); } @@ -457,7 +463,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, // The canonical order specified in ISA manual. // Ref: Table 22.1 in RISC-V User-Level ISA V2.2 StringRef StdExts = AllStdExts; - bool HasF = false, HasD = false; char Baseline = Arch[4]; // First letter should be 'e', 'i' or 'g'. @@ -478,8 +483,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, case 'g': // g = imafd StdExts = StdExts.drop_front(4); - HasF = true; - HasD = true; break; } @@ -560,34 +563,14 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, // The order is OK, then push it into features. // TODO: Use version number when setting target features - switch (C) { - default: - // Currently LLVM supports only "mafdcbv". + // Currently LLVM supports only "mafdcbv". + StringRef SupportedStandardExtension = "mafdcbv"; + if (!SupportedStandardExtension.contains(C)) return createStringError(errc::invalid_argument, "unsupported standard user-level extension '%c'", C); - case 'm': - ISAInfo->addExtension("m", Major, Minor); - break; - case 'a': - ISAInfo->addExtension("a", Major, Minor); - break; - case 'f': - ISAInfo->addExtension("f", Major, Minor); - HasF = true; - break; - case 'd': - ISAInfo->addExtension("d", Major, Minor); - HasD = true; - break; - case 'c': - ISAInfo->addExtension("c", Major, Minor); - break; - case 'v': - ISAInfo->addExtension("v", Major, Minor); - ISAInfo->addExtension("zvlsseg", Major, Minor); - break; - } + ISAInfo->addExtension(std::string(1, C), Major, Minor); + // Consume full extension name and version, including any optional '_' // between this extension and the next ++I; @@ -595,21 +578,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, if (*I == '_') ++I; } - // Dependency check. 
- // It's illegal to specify the 'd' (double-precision floating point) - // extension without also specifying the 'f' (single precision - // floating-point) extension. - // TODO: This has been removed in later specs, which specify that D implies F - if (HasD && !HasF) - return createStringError(errc::invalid_argument, - "d requires f extension to also be specified"); - - // Additional dependency checks. - // TODO: The 'q' extension requires rv64. - // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'. - - if (OtherExts.empty()) - return std::move(ISAInfo); // Handle other types of extensions other than the standard // general purpose and standard user-level extensions. @@ -630,52 +598,53 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, std::array<StringRef, 4> Prefix{"z", "x", "s", "sx"}; auto I = Prefix.begin(); auto E = Prefix.end(); + if (Split.size() > 1 || Split[0] != "") { + for (StringRef Ext : Split) { + if (Ext.empty()) + return createStringError(errc::invalid_argument, + "extension name missing after separator '_'"); - for (StringRef Ext : Split) { - if (Ext.empty()) - return createStringError(errc::invalid_argument, - "extension name missing after separator '_'"); + StringRef Type = getExtensionType(Ext); + StringRef Desc = getExtensionTypeDesc(Ext); + auto Pos = findFirstNonVersionCharacter(Ext) + 1; + StringRef Name(Ext.substr(0, Pos)); + StringRef Vers(Ext.substr(Pos)); - StringRef Type = getExtensionType(Ext); - StringRef Desc = getExtensionTypeDesc(Ext); - auto Pos = Ext.find_if(isDigit); - StringRef Name(Ext.substr(0, Pos)); - StringRef Vers(Ext.substr(Pos)); + if (Type.empty()) + return createStringError(errc::invalid_argument, + "invalid extension prefix '" + Ext + "'"); - if (Type.empty()) - return createStringError(errc::invalid_argument, - "invalid extension prefix '" + Ext + "'"); + // Check ISA extensions are specified in the canonical order. 
+ while (I != E && *I != Type) + ++I; - // Check ISA extensions are specified in the canonical order. - while (I != E && *I != Type) - ++I; + if (I == E) + return createStringError(errc::invalid_argument, + "%s not given in canonical order '%s'", + Desc.str().c_str(), Ext.str().c_str()); - if (I == E) - return createStringError(errc::invalid_argument, - "%s not given in canonical order '%s'", - Desc.str().c_str(), Ext.str().c_str()); - - if (Name.size() == Type.size()) { - return createStringError(errc::invalid_argument, - "%s name missing after '%s'", Desc.str().c_str(), - Type.str().c_str()); - } + if (Name.size() == Type.size()) { + return createStringError(errc::invalid_argument, + "%s name missing after '%s'", + Desc.str().c_str(), Type.str().c_str()); + } - unsigned Major, Minor, ConsumeLength; - if (auto E = getExtensionVersion(Name, Vers, Major, Minor, ConsumeLength, - EnableExperimentalExtension, - ExperimentalExtensionVersionCheck)) - return std::move(E); + unsigned Major, Minor, ConsumeLength; + if (auto E = getExtensionVersion(Name, Vers, Major, Minor, ConsumeLength, + EnableExperimentalExtension, + ExperimentalExtensionVersionCheck)) + return std::move(E); - // Check if duplicated extension. - if (llvm::is_contained(AllExts, Name)) - return createStringError(errc::invalid_argument, "duplicated %s '%s'", - Desc.str().c_str(), Name.str().c_str()); + // Check if duplicated extension. + if (llvm::is_contained(AllExts, Name)) + return createStringError(errc::invalid_argument, "duplicated %s '%s'", + Desc.str().c_str(), Name.str().c_str()); - ISAInfo->addExtension(Name, Major, Minor); - // Extension format is correct, keep parsing the extensions. - // TODO: Save Type, Name, Major, Minor to avoid parsing them later. - AllExts.push_back(Name); + ISAInfo->addExtension(Name, Major, Minor); + // Extension format is correct, keep parsing the extensions. + // TODO: Save Type, Name, Major, Minor to avoid parsing them later. 
+ AllExts.push_back(Name); + } } for (auto Ext : AllExts) { @@ -686,11 +655,83 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension, } } + ISAInfo->updateImplication(); ISAInfo->updateFLen(); + if (Error Result = ISAInfo->checkDependency()) + return std::move(Result); + return std::move(ISAInfo); } +Error RISCVISAInfo::checkDependency() { + bool IsRv32 = XLen == 32; + bool HasE = Exts.count("e") == 1; + bool HasD = Exts.count("d") == 1; + bool HasF = Exts.count("f") == 1; + + if (HasE && !IsRv32) + return createStringError( + errc::invalid_argument, + "standard user-level extension 'e' requires 'rv32'"); + + // It's illegal to specify the 'd' (double-precision floating point) + // extension without also specifying the 'f' (single precision + // floating-point) extension. + // TODO: This has been removed in later specs, which specify that D implies F + if (HasD && !HasF) + return createStringError(errc::invalid_argument, + "d requires f extension to also be specified"); + + // Additional dependency checks. + // TODO: The 'q' extension requires rv64. + // TODO: It is illegal to specify 'e' extensions with 'f' and 'd'. 
+ + return Error::success(); +} + +static const char *ImpliedExtsV[] = {"zvlsseg"}; +static const char *ImpliedExtsZfh[] = {"zfhmin"}; + +struct ImpliedExtsEntry { + StringLiteral Name; + ArrayRef<const char *> Exts; + + bool operator<(const ImpliedExtsEntry &Other) const { + return Name < Other.Name; + } + + bool operator<(StringRef Other) const { return Name < Other; } +}; + +static constexpr ImpliedExtsEntry ImpliedExts[] = { + {{"v"}, {ImpliedExtsV}}, + {{"zfh"}, {ImpliedExtsZfh}}, +}; + +void RISCVISAInfo::updateImplication() { + bool HasE = Exts.count("e") == 1; + bool HasI = Exts.count("i") == 1; + + // If not in e extension and i extension does not exist, i extension is + // implied + if (!HasE && !HasI) { + auto Version = findDefaultVersion("i"); + addExtension("i", Version->Major, Version->Minor); + } + + assert(llvm::is_sorted(ImpliedExts) && "Table not sorted by Name"); + for (auto &Ext : Exts) { + auto I = llvm::lower_bound(ImpliedExts, Ext.first); + if (I != std::end(ImpliedExts) && I->Name == Ext.first) { + for (auto &ImpliedExt : I->Exts) { + auto Version = findDefaultVersion(ImpliedExt); + addExtension(ImpliedExt, Version->Major, Version->Minor); + } + } + } +} + void RISCVISAInfo::updateFLen() { FLen = 0; // TODO: Handle q extension. diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp index 779c6c45257d..ea90a24eaced 100644 --- a/llvm/lib/Support/ScopedPrinter.cpp +++ b/llvm/lib/Support/ScopedPrinter.cpp @@ -43,4 +43,14 @@ void ScopedPrinter::printBinaryImpl(StringRef Label, StringRef Str, } } +JSONScopedPrinter::JSONScopedPrinter( + raw_ostream &OS, bool PrettyPrint, + std::unique_ptr<DelimitedScope> &&OuterScope) + : ScopedPrinter(OS, ScopedPrinter::ScopedPrinterKind::JSON), + JOS(OS, /*Indent=*/PrettyPrint ? 
2 : 0), + OuterScope(std::move(OuterScope)) { + if (this->OuterScope) + this->OuterScope->setPrinter(*this); +} + } // namespace llvm diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp index dd4dded4cd1d..c018dc92bf40 100644 --- a/llvm/lib/Support/Signals.cpp +++ b/llvm/lib/Support/Signals.cpp @@ -87,8 +87,7 @@ static CallbackAndCookie CallBacksToRun[MaxSignalHandlerCallbacks]; // Signal-safe. void sys::RunSignalHandlers() { - for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) { - auto &RunMe = CallBacksToRun[I]; + for (CallbackAndCookie &RunMe : CallBacksToRun) { auto Expected = CallbackAndCookie::Status::Initialized; auto Desired = CallbackAndCookie::Status::Executing; if (!RunMe.Flag.compare_exchange_strong(Expected, Desired)) @@ -103,8 +102,7 @@ void sys::RunSignalHandlers() { // Signal-safe. static void insertSignalHandler(sys::SignalHandlerCallback FnPtr, void *Cookie) { - for (size_t I = 0; I < MaxSignalHandlerCallbacks; ++I) { - auto &SetMe = CallBacksToRun[I]; + for (CallbackAndCookie &SetMe : CallBacksToRun) { auto Expected = CallbackAndCookie::Status::Empty; auto Desired = CallbackAndCookie::Status::Initializing; if (!SetMe.Flag.compare_exchange_strong(Expected, Desired)) diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index 89b7dc939dfc..2eb2989b200b 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -292,8 +292,7 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind, // Convert any ranges to column ranges that only intersect the line of the // location. 
- for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { - SMRange R = Ranges[i]; + for (SMRange R : Ranges) { if (!R.isValid()) continue; diff --git a/llvm/lib/Support/Statistic.cpp b/llvm/lib/Support/Statistic.cpp index d95c8642c16e..95ee885d2f8f 100644 --- a/llvm/lib/Support/Statistic.cpp +++ b/llvm/lib/Support/Statistic.cpp @@ -177,11 +177,10 @@ void llvm::PrintStatistics(raw_ostream &OS) { // Figure out how long the biggest Value and Name fields are. unsigned MaxDebugTypeLen = 0, MaxValLen = 0; - for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) { - MaxValLen = std::max(MaxValLen, - (unsigned)utostr(Stats.Stats[i]->getValue()).size()); - MaxDebugTypeLen = std::max(MaxDebugTypeLen, - (unsigned)std::strlen(Stats.Stats[i]->getDebugType())); + for (TrackingStatistic *Stat : Stats.Stats) { + MaxValLen = std::max(MaxValLen, (unsigned)utostr(Stat->getValue()).size()); + MaxDebugTypeLen = + std::max(MaxDebugTypeLen, (unsigned)std::strlen(Stat->getDebugType())); } Stats.sort(); @@ -192,11 +191,9 @@ void llvm::PrintStatistics(raw_ostream &OS) { << "===" << std::string(73, '-') << "===\n\n"; // Print all of the statistics. - for (size_t i = 0, e = Stats.Stats.size(); i != e; ++i) - OS << format("%*u %-*s - %s\n", - MaxValLen, Stats.Stats[i]->getValue(), - MaxDebugTypeLen, Stats.Stats[i]->getDebugType(), - Stats.Stats[i]->getDesc()); + for (TrackingStatistic *Stat : Stats.Stats) + OS << format("%*u %-*s - %s\n", MaxValLen, Stat->getValue(), + MaxDebugTypeLen, Stat->getDebugType(), Stat->getDesc()); OS << '\n'; // Flush the output stream. 
OS.flush(); diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp index 4acc23dd455b..bc60bdea5f62 100644 --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -331,6 +331,21 @@ bool getCPUFeaturesExceptStdExt(CPUKind Kind, return true; } +StringRef computeDefaultABIFromArch(const llvm::RISCVISAInfo &ISAInfo) { + if (ISAInfo.getXLen() == 32) { + if (ISAInfo.hasExtension("d")) + return "ilp32d"; + if (ISAInfo.hasExtension("e")) + return "ilp32e"; + return "ilp32"; + } else if (ISAInfo.getXLen() == 64) { + if (ISAInfo.hasExtension("d")) + return "lp64d"; + return "lp64"; + } + llvm_unreachable("Invalid XLEN"); +} + } // namespace RISCV } // namespace llvm diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp index c11e16d3cf98..54ea84d4bd6d 100644 --- a/llvm/lib/Support/ThreadPool.cpp +++ b/llvm/lib/Support/ThreadPool.cpp @@ -21,13 +21,17 @@ using namespace llvm; #if LLVM_ENABLE_THREADS ThreadPool::ThreadPool(ThreadPoolStrategy S) - : ThreadCount(S.compute_thread_count()) { - // Create ThreadCount threads that will loop forever, wait on QueueCondition - // for tasks to be queued or the Pool to be destroyed. - Threads.reserve(ThreadCount); - for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) { - Threads.emplace_back([S, ThreadID, this] { - S.apply_thread_strategy(ThreadID); + : Strategy(S), MaxThreadCount(S.compute_thread_count()) {} + +void ThreadPool::grow(int requested) { + std::unique_lock<std::mutex> LockGuard(ThreadsLock); + if (Threads.size() >= MaxThreadCount) + return; // Already hit the max thread pool size. 
+ int newThreadCount = std::min<int>(requested, MaxThreadCount); + while (static_cast<int>(Threads.size()) < newThreadCount) { + int ThreadID = Threads.size(); + Threads.emplace_back([this, ThreadID] { + Strategy.apply_thread_strategy(ThreadID); while (true) { std::function<void()> Task; { @@ -73,6 +77,7 @@ void ThreadPool::wait() { } bool ThreadPool::isWorkerThread() const { + std::unique_lock<std::mutex> LockGuard(ThreadsLock); llvm::thread::id CurrentThreadId = llvm::this_thread::get_id(); for (const llvm::thread &Thread : Threads) if (CurrentThreadId == Thread.get_id()) @@ -87,6 +92,7 @@ ThreadPool::~ThreadPool() { EnableFlag = false; } QueueCondition.notify_all(); + std::unique_lock<std::mutex> LockGuard(ThreadsLock); for (auto &Worker : Threads) Worker.join(); } @@ -94,8 +100,8 @@ ThreadPool::~ThreadPool() { #else // LLVM_ENABLE_THREADS Disabled // No threads are launched, issue a warning if ThreadCount is not 0 -ThreadPool::ThreadPool(ThreadPoolStrategy S) - : ThreadCount(S.compute_thread_count()) { +ThreadPool::ThreadPool(ThreadPoolStrategy S) : MaxThreadCount(1) { + int ThreadCount = S.compute_thread_count(); if (ThreadCount != 1) { errs() << "Warning: request a ThreadPool with " << ThreadCount << " threads, but LLVM_ENABLE_THREADS has been turned off\n"; diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index b9a92e280576..2819dc0c139a 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -989,10 +989,9 @@ std::string Triple::normalize(StringRef Str) { } // Replace empty components with "unknown" value. - for (unsigned i = 0, e = Components.size(); i < e; ++i) { - if (Components[i].empty()) - Components[i] = "unknown"; - } + for (StringRef &C : Components) + if (C.empty()) + C = "unknown"; // Special case logic goes here. At this point Arch, Vendor and OS have the // correct values for the computed components. 
@@ -1091,53 +1090,22 @@ StringRef Triple::getOSAndEnvironmentName() const { return Tmp.split('-').second; // Strip second component } -static unsigned EatNumber(StringRef &Str) { - assert(!Str.empty() && isDigit(Str[0]) && "Not a number"); - unsigned Result = 0; - - do { - // Consume the leading digit. - Result = Result*10 + (Str[0] - '0'); - - // Eat the digit. - Str = Str.substr(1); - } while (!Str.empty() && isDigit(Str[0])); - - return Result; -} - -static void parseVersionFromName(StringRef Name, unsigned &Major, - unsigned &Minor, unsigned &Micro) { - // Any unset version defaults to 0. - Major = Minor = Micro = 0; - - // Parse up to three components. - unsigned *Components[3] = {&Major, &Minor, &Micro}; - for (unsigned i = 0; i != 3; ++i) { - if (Name.empty() || Name[0] < '0' || Name[0] > '9') - break; - - // Consume the leading number. - *Components[i] = EatNumber(Name); - - // Consume the separator, if present. - if (Name.startswith(".")) - Name = Name.substr(1); - } +static VersionTuple parseVersionFromName(StringRef Name) { + VersionTuple Version; + Version.tryParse(Name); + return Version.withoutBuild(); } -void Triple::getEnvironmentVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getEnvironmentVersion() const { StringRef EnvironmentName = getEnvironmentName(); StringRef EnvironmentTypeName = getEnvironmentTypeName(getEnvironment()); if (EnvironmentName.startswith(EnvironmentTypeName)) EnvironmentName = EnvironmentName.substr(EnvironmentTypeName.size()); - parseVersionFromName(EnvironmentName, Major, Minor, Micro); + return parseVersionFromName(EnvironmentName); } -void Triple::getOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getOSVersion() const { StringRef OSName = getOSName(); // Assume that the OS portion of the triple starts with the canonical name. 
StringRef OSTypeName = getOSTypeName(getOS()); @@ -1146,40 +1114,36 @@ void Triple::getOSVersion(unsigned &Major, unsigned &Minor, else if (getOS() == MacOSX) OSName.consume_front("macos"); - parseVersionFromName(OSName, Major, Minor, Micro); + return parseVersionFromName(OSName); } -bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { - getOSVersion(Major, Minor, Micro); +bool Triple::getMacOSXVersion(VersionTuple &Version) const { + Version = getOSVersion(); switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: // Default to darwin8, i.e., MacOSX 10.4. - if (Major == 0) - Major = 8; + if (Version.getMajor() == 0) + Version = VersionTuple(8); // Darwin version numbers are skewed from OS X versions. - if (Major < 4) + if (Version.getMajor() < 4) { return false; - if (Major <= 19) { - Micro = 0; - Minor = Major - 4; - Major = 10; + } + if (Version.getMajor() <= 19) { + Version = VersionTuple(10, Version.getMajor() - 4); } else { - Micro = 0; - Minor = 0; // darwin20+ corresponds to macOS 11+. - Major = 11 + Major - 20; + Version = VersionTuple(11 + Version.getMajor() - 20); } break; case MacOSX: // Default to 10.4. - if (Major == 0) { - Major = 10; - Minor = 4; - } else if (Major < 10) + if (Version.getMajor() == 0) { + Version = VersionTuple(10, 4); + } else if (Version.getMajor() < 10) { return false; + } break; case IOS: case TvOS: @@ -1188,16 +1152,13 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the OS X version number even when targeting // IOS. 
- Major = 10; - Minor = 4; - Micro = 0; + Version = VersionTuple(10, 4); break; } return true; } -void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getiOSVersion() const { switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: @@ -1206,24 +1167,21 @@ void Triple::getiOSVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the iOS version number even when targeting // OS X. - Major = 5; - Minor = 0; - Micro = 0; - break; + return VersionTuple(5); case IOS: - case TvOS: - getOSVersion(Major, Minor, Micro); + case TvOS: { + VersionTuple Version = getOSVersion(); // Default to 5.0 (or 7.0 for arm64). - if (Major == 0) - Major = (getArch() == aarch64) ? 7 : 5; - break; + if (Version.getMajor() == 0) + return (getArch() == aarch64) ? VersionTuple(7) : VersionTuple(5); + return Version; + } case WatchOS: llvm_unreachable("conflicting triple info"); } } -void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, - unsigned &Micro) const { +VersionTuple Triple::getWatchOSVersion() const { switch (getOS()) { default: llvm_unreachable("unexpected OS for Darwin triple"); case Darwin: @@ -1232,15 +1190,13 @@ void Triple::getWatchOSVersion(unsigned &Major, unsigned &Minor, // the clang driver combines OS X and IOS support into a common Darwin // toolchain that wants to know the iOS version number even when targeting // OS X. 
- Major = 2; - Minor = 0; - Micro = 0; - break; - case WatchOS: - getOSVersion(Major, Minor, Micro); - if (Major == 0) - Major = 2; - break; + return VersionTuple(2); + case WatchOS: { + VersionTuple Version = getOSVersion(); + if (Version.getMajor() == 0) + return VersionTuple(2); + return Version; + } case IOS: llvm_unreachable("conflicting triple info"); } diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc index 19d89db55627..f5cb5895d95d 100644 --- a/llvm/lib/Support/Unix/Path.inc +++ b/llvm/lib/Support/Unix/Path.inc @@ -590,19 +590,6 @@ std::error_code rename(const Twine &from, const Twine &to) { } std::error_code resize_file(int FD, uint64_t Size) { -#if defined(HAVE_POSIX_FALLOCATE) - // If we have posix_fallocate use it. Unlike ftruncate it always allocates - // space, so we get an error if the disk is full. - if (int Err = ::posix_fallocate(FD, 0, Size)) { -#ifdef _AIX - constexpr int NotSupportedError = ENOTSUP; -#else - constexpr int NotSupportedError = EOPNOTSUPP; -#endif - if (Err != EINVAL && Err != NotSupportedError) - return std::error_code(Err, std::generic_category()); - } -#endif // Use ftruncate as a fallback. It may or may not allocate space. At least on // OS X with HFS+ it does. 
if (::ftruncate(FD, Size) == -1) diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 9bf0384b5f1b..bec4e8dbe06c 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -75,6 +75,12 @@ Status::Status(const Twine &Name, UniqueID UID, sys::TimePoint<> MTime, : Name(Name.str()), UID(UID), MTime(MTime), User(User), Group(Group), Size(Size), Type(Type), Perms(Perms) {} +Status Status::copyWithNewSize(const Status &In, uint64_t NewSize) { + return Status(In.getName(), In.getUniqueID(), In.getLastModificationTime(), + In.getUser(), In.getGroup(), NewSize, In.getType(), + In.getPermissions()); +} + Status Status::copyWithNewName(const Status &In, const Twine &NewName) { return Status(NewName, In.getUniqueID(), In.getLastModificationTime(), In.getUser(), In.getGroup(), In.getSize(), In.getType(), diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index f68ba0d065c1..2adf37a511d1 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -1876,8 +1876,8 @@ document_iterator Stream::end() { } void Stream::skip() { - for (document_iterator i = begin(), e = end(); i != e; ++i) - i->skip(); + for (Document &Doc : *this) + Doc.skip(); } Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, diff --git a/llvm/lib/TableGen/StringMatcher.cpp b/llvm/lib/TableGen/StringMatcher.cpp index 7f30c7b60752..7474c5dfe885 100644 --- a/llvm/lib/TableGen/StringMatcher.cpp +++ b/llvm/lib/TableGen/StringMatcher.cpp @@ -32,8 +32,8 @@ FindFirstNonCommonLetter(const std::vector<const // Check to see if letter i is the same across the set. 
char Letter = Matches[0]->first[i]; - for (unsigned str = 0, e = Matches.size(); str != e; ++str) - if (Matches[str]->first[i] != Letter) + for (const StringMatcher::StringPair *Match : Matches) + if (Match->first[i] != Letter) return i; } @@ -75,9 +75,8 @@ bool StringMatcher::EmitStringMatcherForChar( // Bucket the matches by the character we are comparing. std::map<char, std::vector<const StringPair*>> MatchesByLetter; - for (unsigned i = 0, e = Matches.size(); i != e; ++i) - MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]); - + for (const StringPair *Match : Matches) + MatchesByLetter[Match->first[CharNo]].push_back(Match); // If we have exactly one bucket to match, see how many characters are common // across the whole set and match all of them at once. @@ -135,8 +134,8 @@ void StringMatcher::Emit(unsigned Indent, bool IgnoreDuplicates) const { // First level categorization: group strings by length. std::map<unsigned, std::vector<const StringPair*>> MatchesByLength; - for (unsigned i = 0, e = Matches.size(); i != e; ++i) - MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]); + for (const StringPair &Match : Matches) + MatchesByLength[Match.first.size()].push_back(&Match); // Output a switch statement on length and categorize the elements within each // bin. diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 548e4e0c9389..cb17fd94c335 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -455,6 +455,9 @@ def FeatureEL2VMSA : SubtargetFeature<"el2vmsa", "HasEL2VMSA", "true", def FeatureEL3 : SubtargetFeature<"el3", "HasEL3", "true", "Enable Exception Level 3">; +def FeatureFixCortexA53_835769 : SubtargetFeature<"fix-cortex-a53-835769", + "FixCortexA53_835769", "true", "Mitigate Cortex-A53 Erratum 835769">; + //===----------------------------------------------------------------------===// // Architectures. 
// diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index 7fd51a98ad94..4cdf5f144437 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -116,8 +117,13 @@ INITIALIZE_PASS(AArch64A53Fix835769, "aarch64-fix-cortex-a53-835769-pass", bool AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { LLVM_DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n"); + auto &STI = F.getSubtarget<AArch64Subtarget>(); + // Fix not requested, skip pass. + if (!STI.fixCortexA53_835769()) + return false; + bool Changed = false; - TII = F.getSubtarget().getInstrInfo(); + TII = STI.getInstrInfo(); for (auto &MBB : F) { Changed |= runOnBasicBlock(MBB); diff --git a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index cd67e058a9c1..9e31243cd696 100644 --- a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -398,8 +398,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { TII = mf.getSubtarget().getInstrInfo(); // Just check things on a one-block-at-a-time basis. 
- for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(&*I)) + for (MachineBasicBlock &MBB : mf) + if (processMachineBasicBlock(&MBB)) Changed = true; return Changed; } diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index aeebb49675b2..85a9c04a3fef 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -73,6 +73,7 @@ class AArch64AsmPrinter : public AsmPrinter { StackMaps SM; FaultMaps FM; const AArch64Subtarget *STI; + bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -186,6 +187,10 @@ private: using MInstToMCSymbol = std::map<const MachineInstr *, MCSymbol *>; MInstToMCSymbol LOHInstToLabel; + + bool shouldEmitWeakSwiftAsyncExtendedFramePointerFlags() const override { + return ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags; + } }; } // end anonymous namespace @@ -1132,6 +1137,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; + if (MI->getOpcode() == AArch64::ADRP) { + for (auto &Opd : MI->operands()) { + if (Opd.isSymbol() && StringRef(Opd.getSymbolName()) == + "swift_async_extendedFramePointerFlags") { + ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = true; + } + } + } + if (AArch64FI->getLOHRelated().count(MI)) { // Generate a label for LOH related instruction MCSymbol *LOHLabel = createTempSymbol("loh"); diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index d2097f7e6ee3..1994e0eb7fb9 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -196,6 +196,13 @@ def mutate_anyext_to_zext : GICombineRule< (apply [{ applyMutateAnyExtToZExt(*${d}, MRI, B, Observer); }]) >; +def split_store_zero_128 : GICombineRule< + 
(defs root:$d), + (match (wip_match_opcode G_STORE):$d, + [{ return matchSplitStoreZero128(*${d}, MRI); }]), + (apply [{ applySplitStoreZero128(*${d}, MRI, B, Observer); }]) +>; + // Post-legalization combines which should happen at all optimization levels. // (E.g. ones that facilitate matching for the selector) For example, matching // pseudos. @@ -220,6 +227,7 @@ def AArch64PostLegalizerCombinerHelper icmp_to_true_false_known_bits, merge_unmerge, select_combines, fold_merge_to_zext, constant_fold, identity_combines, - ptr_add_immed_chain, overlapping_and]> { + ptr_add_immed_chain, overlapping_and, + split_store_zero_128]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp index d98a5cfd4f50..4f324198f3dc 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandImm.cpp @@ -51,10 +51,9 @@ static bool tryToreplicateChunks(uint64_t UImm, ++Counts[getChunk(UImm, Idx)]; // Traverse the chunks to find one which occurs more than once. 
- for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; + for (const auto &Chunk : Counts) { + const uint64_t ChunkVal = Chunk.first; + const unsigned Count = Chunk.second; uint64_t Encoding = 0; diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 209f9f7255a5..793663ef97d7 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -138,8 +138,8 @@ bool FalkorMarkStridedAccesses::run() { bool MadeChange = false; for (Loop *L : LI) - for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) - MadeChange |= runOnLoop(**LIt); + for (Loop *LIt : depth_first(L)) + MadeChange |= runOnLoop(*LIt); return MadeChange; } @@ -828,10 +828,10 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { Modified = false; for (MachineLoop *I : LI) - for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + for (MachineLoop *L : depth_first(I)) // Only process inner-loops if (L->isInnermost()) - runOnLoop(**L, Fn); + runOnLoop(*L, Fn); return Modified; } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index b630f4f0df5f..638e45b30d99 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -3041,10 +3041,21 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, // Create a buffer of SVE objects to allocate and sort it. SmallVector<int, 8> ObjectsToAllocate; + // If we have a stack protector, and we've previously decided that we have SVE + // objects on the stack and thus need it to go in the SVE stack area, then it + // needs to go first. 
+ int StackProtectorFI = -1; + if (MFI.hasStackProtectorIndex()) { + StackProtectorFI = MFI.getStackProtectorIndex(); + if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector) + ObjectsToAllocate.push_back(StackProtectorFI); + } for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { unsigned StackID = MFI.getStackID(I); if (StackID != TargetStackID::ScalableVector) continue; + if (I == StackProtectorFI) + continue; if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) continue; if (MFI.isDeadObjectIndex(I)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 72461aa1f772..e141179fb5c8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -204,6 +205,8 @@ static bool isMergePassthruOpcode(unsigned Opc) { return false; case AArch64ISD::BITREVERSE_MERGE_PASSTHRU: case AArch64ISD::BSWAP_MERGE_PASSTHRU: + case AArch64ISD::REVH_MERGE_PASSTHRU: + case AArch64ISD::REVW_MERGE_PASSTHRU: case AArch64ISD::CTLZ_MERGE_PASSTHRU: case AArch64ISD::CTPOP_MERGE_PASSTHRU: case AArch64ISD::DUP_MERGE_PASSTHRU: @@ -2227,6 +2230,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::STNP) MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU) + MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU) MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU) @@ -4213,6 +4218,12 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case 
Intrinsic::aarch64_sve_revb: return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revh: + return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); + case Intrinsic::aarch64_sve_revw: + return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); case Intrinsic::aarch64_sve_sxtb: return DAG.getNode( AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(), @@ -10958,16 +10969,15 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, EVT InVT = Op.getOperand(1).getValueType(); unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - if (InVT.isScalableVector()) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); + SDValue Vec0 = Op.getOperand(0); + SDValue Vec1 = Op.getOperand(1); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + if (InVT.isScalableVector()) { if (!isTypeLegal(VT)) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - // Ensure the subvector is half the size of the main vector. if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2)) return SDValue(); @@ -10997,9 +11007,18 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, return SDValue(); } - // This will be matched by custom code during ISelDAGToDAG. - if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef()) - return Op; + if (Idx == 0 && isPackedVectorType(VT, DAG)) { + // This will be matched by custom code during ISelDAGToDAG. 
+ if (Vec0.isUndef()) + return Op; + + unsigned int PredPattern = + getSVEPredPatternFromNumElements(InVT.getVectorNumElements()); + auto PredTy = VT.changeVectorElementType(MVT::i1); + SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern); + SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1); + return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0); + } return SDValue(); } @@ -11794,6 +11813,9 @@ bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, Base.getOperand(1).getOpcode() == ISD::SHL && Base.getOperand(1).hasOneUse() && Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { + // It's unknown whether a scalable vector has a power-of-2 bitwidth. + if (Mem->getMemoryVT().isScalableVector()) + return false; // The shift can be combined if it matches the size of the value being // loaded (and so reducing the width would make it not match). uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); @@ -15820,6 +15842,23 @@ static SDValue performVectorShiftCombine(SDNode *N, return SDValue(); } +static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) { + // sunpklo(sext(pred)) -> sext(extract_low_half(pred)) + // This transform works in partnership with performSetCCPunpkCombine to + // remove unnecessary transfer of predicates into standard registers and back + if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND && + N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() == + MVT::i1) { + SDValue CC = N->getOperand(0)->getOperand(0); + auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext()); + SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC, + DAG.getVectorIdxConstant(0, SDLoc(N))); + return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk); + } + + return SDValue(); +} + /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. 
static SDValue performPostLD1Combine(SDNode *N, @@ -15982,7 +16021,9 @@ static SDValue performSTORECombine(SDNode *N, if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND && Value.getNode()->hasOneUse() && ST->isUnindexed() && Subtarget->useSVEForFixedLengthVectors() && - Value.getValueType().isFixedLengthVector()) + Value.getValueType().isFixedLengthVector() && + Value.getValueType().getFixedSizeInBits() >= + Subtarget->getMinSVEVectorSizeInBits()) return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); @@ -16495,6 +16536,44 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) { + // setcc_merge_zero pred + // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne + // => extract_subvector (inner setcc_merge_zero) + SDValue Pred = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get(); + + if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) || + LHS->getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + SDValue Extract = LHS->getOperand(0); + if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR || + Extract->getValueType(0) != N->getValueType(0) || + Extract->getConstantOperandVal(1) != 0) + return SDValue(); + + SDValue InnerSetCC = Extract->getOperand(0); + if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO) + return SDValue(); + + // By this point we've effectively got + // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive + // lanes are already zero then the trunc(sext()) sequence is redundant and we + // can operate on A directly. 
+ SDValue InnerPred = InnerSetCC.getOperand(0); + if (Pred.getOpcode() == AArch64ISD::PTRUE && + InnerPred.getOpcode() == AArch64ISD::PTRUE && + Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) && + Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 && + Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256) + return Extract; + + return SDValue(); +} + static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO && "Unexpected opcode!"); @@ -16513,6 +16592,9 @@ static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) { LHS->getOperand(0)->getOperand(0) == Pred) return LHS->getOperand(0); + if (SDValue V = performSetCCPunpkCombine(N, DAG)) + return V; + return SDValue(); } @@ -17343,7 +17425,8 @@ SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, // they can be split down into something legal. if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() && - VT.isFixedLengthVector()) { + VT.isFixedLengthVector() && + VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), @@ -17455,6 +17538,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::VASHR: case AArch64ISD::VLSHR: return performVectorShiftCombine(N, *this, DCI); + case AArch64ISD::SUNPKLO: + return performSunpkloCombine(N, DAG); case ISD::INSERT_VECTOR_ELT: return performInsertVectorEltCombine(N, DCI); case ISD::EXTRACT_VECTOR_ELT: @@ -18570,7 +18655,25 @@ AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { } void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { - MF.getFrameInfo().computeMaxCallFrameSize(MF); + MachineFrameInfo &MFI = MF.getFrameInfo(); + // If we have any vulnerable SVE stack 
objects then the stack protector + // needs to be placed at the top of the SVE stack area, as the SVE locals + // are placed above the other locals, so we allocate it as if it were a + // scalable vector. + // FIXME: It may be worthwhile having a specific interface for this rather + // than doing it here in finalizeLowering. + if (MFI.hasStackProtectorIndex()) { + for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) { + if (MFI.getStackID(i) == TargetStackID::ScalableVector && + MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) { + MFI.setStackID(MFI.getStackProtectorIndex(), + TargetStackID::ScalableVector); + MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16)); + break; + } + } + } + MFI.computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } @@ -18855,10 +18958,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE( SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( SDValue Op, SelectionDAG &DAG) const { - auto Store = cast<MaskedStoreSDNode>(Op); - - if (Store->isTruncatingStore()) - return SDValue(); + auto *Store = cast<MaskedStoreSDNode>(Op); SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); @@ -19103,7 +19203,7 @@ SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op, if (isMergePassthruOpcode(NewOp)) Operands.push_back(DAG.getUNDEF(VT)); - return DAG.getNode(NewOp, DL, VT, Operands); + return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags()); } // If a fixed length vector operation has no side effects when applied to @@ -19498,6 +19598,94 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( return convertFromScalableVector(DAG, VT, Op); } + for (unsigned LaneSize : {64U, 32U, 16U}) { + if (isREVMask(ShuffleMask, VT, LaneSize)) { + EVT NewVT = + getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize)); + unsigned RevOp; + unsigned EltSz = VT.getScalarSizeInBits(); + if (EltSz == 8) + RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU; + 
else if (EltSz == 16) + RevOp = AArch64ISD::REVH_MERGE_PASSTHRU; + else + RevOp = AArch64ISD::REVW_MERGE_PASSTHRU; + + Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1); + Op = LowerToPredicatedOp(Op, DAG, RevOp); + Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op); + return convertFromScalableVector(DAG, VT, Op); + } + } + + unsigned WhichResult; + if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2)); + + if (isTRNMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1)); + + if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); + } + + // Functions like isZIPMask return true when a ISD::VECTOR_SHUFFLE's mask + // represents the same logical operation as performed by a ZIP instruction. In + // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly + // equivalent to an AArch64 instruction. There's the extra component of + // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions + // only operated on 64/128bit vector types that have a direct mapping to a + // target register and so an exact mapping is implied. + // However, when using SVE for fixed length vectors, most legal vector types + // are actually sub-vectors of a larger SVE register. When mapping + // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider + // how the mask's indices translate. 
Specifically, when the mapping requires + // an exact meaning for a specific vector index (e.g. Index X is the last + // vector element in the register) then such mappings are often only safe when + // the exact SVE register size is know. The main exception to this is when + // indices are logically relative to the first element of either + // ISD::VECTOR_SHUFFLE operand because these relative indices don't change + // when converting from fixed-length to scalable vector types (i.e. the start + // of a fixed length vector is always the start of a scalable vector). + unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits(); + unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits(); + if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) { + if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) { + Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1); + return convertFromScalableVector(DAG, VT, Op); + } + + if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2)); + + if (isUZPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0) + return convertFromScalableVector( + DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1)); + + if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::UZP1 : AArch64ISD::UZP2; + return convertFromScalableVector( + DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1)); + } + } + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ea884cdccd28..367ba3039a0c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -324,6 +324,8 @@ enum NodeType : unsigned { BITREVERSE_MERGE_PASSTHRU, BSWAP_MERGE_PASSTHRU, + REVH_MERGE_PASSTHRU, + REVW_MERGE_PASSTHRU, CTLZ_MERGE_PASSTHRU, CTPOP_MERGE_PASSTHRU, DUP_MERGE_PASSTHRU, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cd4bc8a61a8a..f8d492188744 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -387,16 +387,16 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } -def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>; +def am_sve_fi : ComplexPattern<iPTR, 2, "SelectAddrModeFrameIndexSVE", []>; -def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; -def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; -def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; -def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; -def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; +def am_indexed7s8 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed7S128", []>; -def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>; -def 
am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>; +def am_indexedu6s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedU6S128", []>; +def am_indexeds9s128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedS9S128", []>; def UImmS1XForm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64); @@ -3177,18 +3177,18 @@ def maski16_or_more : Operand<i32>, // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,4095]. -def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>; -def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>; -def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>; -def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>; -def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>; +def am_indexed8 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed8", []>; +def am_indexed16 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed16", []>; +def am_indexed32 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed32", []>; +def am_indexed64 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed64", []>; +def am_indexed128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed128", []>; // (unsigned immediate) // Indexed for 8-bit registers. offset is in range [0,63]. 
-def am_indexed8_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<1,63>", []>; -def am_indexed16_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<2,63>", []>; -def am_indexed32_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<4,63>", []>; -def am_indexed64_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<8,63>", []>; +def am_indexed8_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<1,63>", []>; +def am_indexed16_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<2,63>", []>; +def am_indexed32_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<4,63>", []>; +def am_indexed64_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<8,63>", []>; def gi_am_indexed8 : GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">, @@ -3358,11 +3358,11 @@ class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat> // Load/store register offset //--- -def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>; -def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>; -def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>; -def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>; -def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>; +def ro_Xindexed8 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<8>", []>; +def ro_Xindexed16 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<16>", []>; +def ro_Xindexed32 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<32>", []>; +def ro_Xindexed64 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<64>", []>; +def ro_Xindexed128 : ComplexPattern<iPTR, 4, "SelectAddrModeXRO<128>", []>; def gi_ro_Xindexed8 : GIComplexOperandMatcher<s64, "selectAddrModeXRO<8>">, @@ -3380,11 +3380,11 @@ def gi_ro_Xindexed128 : GIComplexOperandMatcher<s64, "selectAddrModeXRO<128>">, GIComplexPatternEquiv<ro_Xindexed128>; -def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>; -def ro_Windexed16 : ComplexPattern<i64, 4, 
"SelectAddrModeWRO<16>", []>; -def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>; -def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>; -def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>; +def ro_Windexed8 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<8>", []>; +def ro_Windexed16 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<16>", []>; +def ro_Windexed32 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<32>", []>; +def ro_Windexed64 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<64>", []>; +def ro_Windexed128 : ComplexPattern<iPTR, 4, "SelectAddrModeWRO<128>", []>; def gi_ro_Windexed8 : GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">, @@ -3880,11 +3880,11 @@ multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> { // Load/store unscaled immediate //--- -def am_unscaled8 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>; -def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>; -def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>; -def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>; -def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>; +def am_unscaled8 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled8", []>; +def am_unscaled16 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled16", []>; +def am_unscaled32 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled32", []>; +def am_unscaled64 : ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled64", []>; +def am_unscaled128 :ComplexPattern<iPTR, 2, "SelectAddrModeUnscaled128", []>; def gi_am_unscaled8 : GIComplexOperandMatcher<s64, "selectAddrModeUnscaled8">, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index f8f8ee3f1e6c..5fc5e4e5eb35 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -7055,6 +7055,8 @@ bool 
AArch64InstrInfo::isFunctionSafeToOutlineFrom( bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { + if (!TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags)) + return false; // Check if LR is available through all of the MBB. If it's not, then set // a flag. assert(MBB.getParent()->getRegInfo().tracksLiveness() && diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index decee117d2d5..ebccc07edc7a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -4174,6 +4174,21 @@ defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; +def : Pat<(v8i8 (AArch64vashr (v8i8 V64:$Rn), (i32 7))), + (CMLTv8i8rz V64:$Rn)>; +def : Pat<(v4i16 (AArch64vashr (v4i16 V64:$Rn), (i32 15))), + (CMLTv4i16rz V64:$Rn)>; +def : Pat<(v2i32 (AArch64vashr (v2i32 V64:$Rn), (i32 31))), + (CMLTv2i32rz V64:$Rn)>; +def : Pat<(v16i8 (AArch64vashr (v16i8 V128:$Rn), (i32 7))), + (CMLTv16i8rz V128:$Rn)>; +def : Pat<(v8i16 (AArch64vashr (v8i16 V128:$Rn), (i32 15))), + (CMLTv8i16rz V128:$Rn)>; +def : Pat<(v4i32 (AArch64vashr (v4i32 V128:$Rn), (i32 31))), + (CMLTv4i32rz V128:$Rn)>; +def : Pat<(v2i64 (AArch64vashr (v2i64 V128:$Rn), (i32 63))), + (CMLTv2i64rz V128:$Rn)>; + defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; @@ -4363,6 +4378,32 @@ def : Pat<(v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), (v4i32 VImm8000)))), (SQXTNv4i16 V128:$Vn)>; +// concat_vectors(Vd, trunc(smin(smax Vm, -128), 127) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smin (smax (v8i16 V128:$Vn), (v8i16 VImm80)), + (v8i16 
VImm7F)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (trunc (smax (smin (v8i16 V128:$Vn), (v8i16 VImm7F)), + (v8i16 VImm80)))))), + (SQXTNv16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + +// concat_vectors(Vd, trunc(smin(smax Vm, -32768), 32767) ~> SQXTN2(Vd, Vn) +// with reversed min/max +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smin (smax (v4i32 V128:$Vn), (v4i32 VImm8000)), + (v4i32 VImm7FFF)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (trunc (smax (smin (v4i32 V128:$Vn), (v4i32 VImm7FFF)), + (v4i32 VImm8000)))))), + (SQXTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn)>; + //===----------------------------------------------------------------------===// // Advanced SIMD three vector instructions. //===----------------------------------------------------------------------===// @@ -4825,6 +4866,9 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; +def : Pat<(v1i64 (AArch64vashr (v1i64 V64:$Rn), (i32 63))), + (CMLTv1i64rz V64:$Rn)>; + def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), (FCVTASv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), @@ -5288,6 +5332,29 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; +def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))), + (v8i8 (trunc (v8i16 V128:$Vm))))), + (UZP1v16i8 V128:$Vn, V128:$Vm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))), + (v4i16 (trunc (v4i32 V128:$Vm))))), + (UZP1v8i16 V128:$Vn, V128:$Vm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 
V128:$Vn))), + (v2i32 (trunc (v2i64 V128:$Vm))))), + (UZP1v4i32 V128:$Vn, V128:$Vm)>; + +def : Pat<(v16i8 (concat_vectors + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))), + (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))), + (UZP2v16i8 V128:$Vn, V128:$Vm)>; +def : Pat<(v8i16 (concat_vectors + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))), + (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))), + (UZP2v8i16 V128:$Vn, V128:$Vm)>; +def : Pat<(v4i32 (concat_vectors + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))), + (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), + (UZP2v4i32 V128:$Vn, V128:$Vm)>; + //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- @@ -6536,6 +6603,34 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; +// RADDHN patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))), + (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))), + (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))), + (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>; + +// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element +def : Pat<(v16i8 (concat_vectors + (v8i8 V64:$Vd), + (v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))))), + (RADDHNv8i16_v16i8 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v8i16 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v8i16 (concat_vectors + (v4i16 V64:$Vd), + (v4i16 (int_aarch64_neon_rshrn (v4i32 V128:$Vn), (i32 16))))), + 
(RADDHNv4i32_v8i16 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v4i32 (MOVIv2d_ns (i32 0))))>; +def : Pat<(v4i32 (concat_vectors + (v2i32 V64:$Vd), + (v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))))), + (RADDHNv2i64_v4i32 + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, + (v2i64 (MOVIv2d_ns (i32 0))))>; + // SHRN patterns for when a logical right shift was used instead of arithmetic // (the immediate guarantees no sign bits actually end up in the result so it // doesn't matter). diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 25d53f4ab065..eb55a472a69a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -136,15 +136,15 @@ def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER // // SVE CNT/INC/RDVL -def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">; -def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">; -def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">; -def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">; +def sve_rdvl_imm : ComplexPattern<i64, 1, "SelectRDVLImm<-32, 31, 16>">; +def sve_cnth_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 8>">; +def sve_cntw_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 4>">; +def sve_cntd_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, 2>">; // SVE DEC -def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">; -def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">; -def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">; +def sve_cnth_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -8>">; +def sve_cntw_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -4>">; +def sve_cntd_imm_neg : ComplexPattern<i64, 1, "SelectRDVLImm<1, 16, -2>">; def SDT_AArch64Reduce : SDTypeProfile<1, 2, 
[SDTCisVec<1>, SDTCisVec<2>]>; def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>; @@ -231,6 +231,8 @@ def AArch64fsqrt_mt : SDNode<"AArch64ISD::FSQRT_MERGE_PASSTHRU", SDT_AArch64Ari def AArch64frecpx_mt : SDNode<"AArch64ISD::FRECPX_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64rbit_mt : SDNode<"AArch64ISD::BITREVERSE_MERGE_PASSTHRU", SDT_AArch64Arith>; def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>; +def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>; // These are like the above but we don't yet have need for ISD nodes. They allow // a single pattern to match intrinsic and ISD operand layouts. @@ -275,6 +277,11 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), return N->hasOneUse(); }]>; +def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt), + (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{ + return N->getFlags().hasNoSignedZeros(); +}]>; + def SDT_AArch64Arith_Unpred : SDTypeProfile<1, 2, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2> @@ -536,7 +543,8 @@ let Predicates = [HasSVEorStreamingSVE] in { (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zd = -(Za + Zn * Zm) - def : Pat<(AArch64fneg_mt PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))), + // (with nsz neg.) 
+ def : Pat<(AArch64fneg_mt_nsz PredTy:$P, (AArch64fma_p PredTy:$P, Ty:$Zn, Ty:$Zm, Ty:$Za), (Ty (undef))), (!cast<Instruction>("FNMLA_ZPZZZ_UNDEF_"#Suffix) $P, ZPR:$Za, ZPR:$Zn, ZPR:$Zm)>; // Zda = Zda + Zn * Zm @@ -624,13 +632,13 @@ let Predicates = [HasSVEorStreamingSVE] in { def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>; // Duplicate Int immediate into all vector elements - def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_B $a, $b)>; - def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_H $a, $b)>; - def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm32 i32:$a, i32:$b)))), (DUP_ZI_S $a, $b)>; - def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))), + def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm64 i32:$a, i32:$b)))), (DUP_ZI_D $a, $b)>; // Duplicate immediate FP into all vector elements. 
@@ -674,8 +682,8 @@ let Predicates = [HasSVEorStreamingSVE] in { defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", AArch64rbit_mt>; defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", AArch64revb_mt>; - defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; - defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>; + defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", AArch64revh_mt>; + defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", AArch64revw_mt>; defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>; defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>; @@ -2686,13 +2694,13 @@ let Predicates = [HasSVEorStreamingSVE] in { // Splice with lane bigger or equal to 0 def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_255:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_127:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_63:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))), - (EXT_ZZI ZPR:$Z1, ZPR:$Z2, sve_ext_imm_0_31:$index)>; + (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>; } // End HasSVEorStreamingSVE diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 5cec4cb66339..566c7a16db23 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -488,7 +488,7 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) { 
const Align NewAlignment = - max(MaybeAlign(Info.AI->getAlignment()), kTagGranuleSize); + max(MaybeAlign(Info.AI->getAlign()), kTagGranuleSize); Info.AI->setAlignment(NewAlignment); uint64_t Size = Info.AI->getAllocationSizeInBits(*DL).getValue() / 8; @@ -537,15 +537,14 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { SmallVector<Instruction *, 4> UnrecognizedLifetimes; for (auto &BB : *F) { - for (BasicBlock::iterator IT = BB.begin(); IT != BB.end(); ++IT) { - Instruction *I = &*IT; - if (auto *AI = dyn_cast<AllocaInst>(I)) { + for (Instruction &I : BB) { + if (auto *AI = dyn_cast<AllocaInst>(&I)) { Allocas[AI].AI = AI; Allocas[AI].OldAI = AI; continue; } - if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(I)) { + if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) { for (Value *V : DVI->location_ops()) if (auto *AI = dyn_cast_or_null<AllocaInst>(V)) if (Allocas[AI].DbgVariableIntrinsics.empty() || @@ -554,12 +553,12 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { continue; } - auto *II = dyn_cast<IntrinsicInst>(I); + auto *II = dyn_cast<IntrinsicInst>(&I); if (II && (II->getIntrinsicID() == Intrinsic::lifetime_start || II->getIntrinsicID() == Intrinsic::lifetime_end)) { AllocaInst *AI = findAllocaForValue(II->getArgOperand(1)); if (!AI) { - UnrecognizedLifetimes.push_back(I); + UnrecognizedLifetimes.push_back(&I); continue; } if (II->getIntrinsicID() == Intrinsic::lifetime_start) @@ -568,8 +567,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { Allocas[AI].LifetimeEnd.push_back(II); } - if (isa<ReturnInst>(I) || isa<ResumeInst>(I) || isa<CleanupReturnInst>(I)) - RetVec.push_back(I); + if (isa<ReturnInst, ResumeInst, CleanupReturnInst>(&I)) + RetVec.push_back(&I); } } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index d782d6352cbe..f7d3dd0bc222 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -346,9 +346,7 @@ bool 
AArch64Subtarget::supportsAddressTopByteIgnored() const { return false; if (TargetTriple.isiOS()) { - unsigned Major, Minor, Micro; - TargetTriple.getiOSVersion(Major, Minor, Micro); - return Major >= 8; + return TargetTriple.getiOSVersion() >= VersionTuple(8); } return false; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 19db774ccd7b..b3cd5ebd5f65 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -116,6 +116,8 @@ protected: bool HasFP16FML = false; bool HasSPE = false; + bool FixCortexA53_835769 = false; + // ARMv8.1 extensions bool HasVH = false; bool HasPAN = false; @@ -571,6 +573,8 @@ public: bool hasEL2VMSA() const { return HasEL2VMSA; } bool hasEL3() const { return HasEL3; } + bool fixCortexA53_835769() const { return FixCortexA53_835769; } + bool addrSinkUsingGEPs() const override { // Keeping GEPs inbounds is important for exploiting AArch64 // addressing-modes in ILP32 mode. @@ -632,8 +636,7 @@ public: // extended frames should be flagged as present. 
const Triple &TT = getTargetTriple(); - unsigned Major, Minor, Micro; - TT.getOSVersion(Major, Minor, Micro); + unsigned Major = TT.getOSVersion().getMajor(); switch(TT.getOS()) { default: return false; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index ce26c62af61a..4af28fc070dd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -117,11 +117,6 @@ static cl::opt<bool> cl::init(true), cl::Hidden); static cl::opt<bool> -EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden, - cl::desc("Work around Cortex-A53 erratum 835769"), - cl::init(false)); - -static cl::opt<bool> EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), cl::init(false)); @@ -382,10 +377,9 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { unsigned MaxSVEVectorSize = 0; Attribute VScaleRangeAttr = F.getFnAttribute(Attribute::VScaleRange); if (VScaleRangeAttr.isValid()) { - std::tie(MinSVEVectorSize, MaxSVEVectorSize) = - VScaleRangeAttr.getVScaleRangeArgs(); - MinSVEVectorSize *= 128; - MaxSVEVectorSize *= 128; + Optional<unsigned> VScaleMax = VScaleRangeAttr.getVScaleRangeMax(); + MinSVEVectorSize = VScaleRangeAttr.getVScaleRangeMin() * 128; + MaxSVEVectorSize = VScaleMax ? 
VScaleMax.getValue() * 128 : 0; } else { MinSVEVectorSize = SVEVectorBitsMinOpt; MaxSVEVectorSize = SVEVectorBitsMaxOpt; @@ -765,8 +759,7 @@ void AArch64PassConfig::addPreEmitPass() { if (TM->getOptLevel() >= CodeGenOpt::Aggressive && EnableLoadStoreOpt) addPass(createAArch64LoadStoreOptimizationPass()); - if (EnableA53Fix835769) - addPass(createAArch64A53Fix835769()); + addPass(createAArch64A53Fix835769()); if (EnableBranchTargets) addPass(createAArch64BranchTargetsPass()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 34015d2dbd49..d21854e38f5a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -30,6 +30,12 @@ using namespace llvm::PatternMatch; static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden); +static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10), + cl::Hidden); + +static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", + cl::init(10), cl::Hidden); + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -725,6 +731,22 @@ static Optional<Instruction *> instCombineSVEVectorFMLA(InstCombiner &IC, return IC.replaceInstUsesWith(II, FMLA); } +static bool isAllActivePredicate(Value *Pred) { + // Look through convert.from.svbool(convert.to.svbool(...) chain. + Value *UncastedPred; + if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( + m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( + m_Value(UncastedPred))))) + // If the predicate has the same or less lanes than the uncasted + // predicate then we know the casting has no effect. 
+ if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <= + cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements()) + Pred = UncastedPred; + + return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( + m_ConstantInt<AArch64SVEPredPattern::all>())); +} + static Optional<Instruction *> instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { IRBuilder<> Builder(II.getContext()); @@ -735,8 +757,7 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Type *VecTy = II.getType(); Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo()); - if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( - m_ConstantInt<AArch64SVEPredPattern::all>()))) { + if (isAllActivePredicate(Pred)) { LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr); return IC.replaceInstUsesWith(II, Load); } @@ -758,8 +779,7 @@ instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Value *VecPtr = Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo()); - if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( - m_ConstantInt<AArch64SVEPredPattern::all>()))) { + if (isAllActivePredicate(Pred)) { Builder.CreateStore(VecOp, VecPtr); return IC.eraseInstFromFunction(II); } @@ -1008,6 +1028,40 @@ static Optional<Instruction *> instCombineST1ScatterIndex(InstCombiner &IC, return None; } +static Optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, + IntrinsicInst &II) { + IRBuilder<> Builder(II.getContext()); + Builder.SetInsertPoint(&II); + Type *Int32Ty = Builder.getInt32Ty(); + Value *Pred = II.getOperand(0); + Value *Vec = II.getOperand(1); + Value *DivVec = II.getOperand(2); + + Value *SplatValue = getSplatValue(DivVec); + ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue); + if (!SplatConstantInt) + return None; + APInt Divisor = SplatConstantInt->getValue(); + + if (Divisor.isPowerOf2()) { + Constant *DivisorLog2 = ConstantInt::get(Int32Ty, 
Divisor.logBase2()); + auto ASRD = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); + return IC.replaceInstUsesWith(II, ASRD); + } + if (Divisor.isNegatedPowerOf2()) { + Divisor.negate(); + Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2()); + auto ASRD = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); + auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg, + {ASRD->getType()}, {ASRD, Pred, ASRD}); + return IC.replaceInstUsesWith(II, NEG); + } + + return None; +} + Optional<Instruction *> AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -1068,6 +1122,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVELD1(IC, II, DL); case Intrinsic::aarch64_sve_st1: return instCombineSVEST1(IC, II, DL); + case Intrinsic::aarch64_sve_sdiv: + return instCombineSVESDIV(IC, II); } return None; @@ -1746,7 +1802,7 @@ InstructionCost AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) { - if (!isa<ScalableVectorType>(Src)) + if (useNeonVector(Src)) return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); auto LT = TLI->getTypeLegalizationCost(DL, Src); @@ -1763,6 +1819,10 @@ AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, return LT.first * 2; } +static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { + return Opcode == Instruction::Load ? 
SVEGatherOverhead : SVEScatterOverhead; +} + InstructionCost AArch64TTIImpl::getGatherScatterOpCost( unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { @@ -1785,6 +1845,10 @@ InstructionCost AArch64TTIImpl::getGatherScatterOpCost( ElementCount LegalVF = LT.second.getVectorElementCount(); InstructionCost MemOpCost = getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); + // Add on an overhead cost for using gathers/scatters. + // TODO: At the moment this is applied unilaterally for all CPUs, but at some + // point we may want a per-CPU overhead. + MemOpCost *= getSVEGatherScatterOverhead(Opcode); return LT.first * MemOpCost * getMaxNumElements(LegalVF); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d1e8cd204b3a..c3e1735cd4cd 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -309,6 +309,8 @@ public: bool supportsScalableVectors() const { return ST->hasSVE(); } + bool enableScalableVectorization() const { return ST->hasSVE(); } + bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6d3aea2721de..62038b10fccd 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1031,12 +1031,7 @@ public: if (DarwinRefKind != MCSymbolRefExpr::VK_None) return false; - for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return true; - } - - return false; + return llvm::is_contained(AllowedModifiers, ELFRefKind); } bool isMovWSymbolG3() const { diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp 
b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 1524aa5eb0ec..e8894e7933d6 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -785,6 +785,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .libcallFor({s128}) .minScalar(0, MinFPScalar); + // TODO: Vector types. + getActionDefinitionsBuilder({G_FMAXIMUM, G_FMINIMUM}) + .legalFor({MinFPScalar, s32, s64}) + .minScalar(0, MinFPScalar); + // TODO: Libcall support for s128. // TODO: s16 should be legal with full FP16 support. getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index a9b3792e0118..3dec980a819a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -289,6 +289,44 @@ static void applyMutateAnyExtToZExt(MachineInstr &MI, MachineRegisterInfo &MRI, Observer.changedInstr(MI); } +/// Match a 128b store of zero and split it into two 64 bit stores, for +/// size/performance reasons. +static bool matchSplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI) { + GStore &Store = cast<GStore>(MI); + if (!Store.isSimple()) + return false; + LLT ValTy = MRI.getType(Store.getValueReg()); + if (!ValTy.isVector() || ValTy.getSizeInBits() != 128) + return false; + if (ValTy.getSizeInBits() != Store.getMemSizeInBits()) + return false; // Don't split truncating stores. 
+ if (!MRI.hasOneNonDBGUse(Store.getValueReg())) + return false; + auto MaybeCst = isConstantOrConstantSplatVector( + *MRI.getVRegDef(Store.getValueReg()), MRI); + return MaybeCst && MaybeCst->isZero(); +} + +static void applySplitStoreZero128(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer) { + B.setInstrAndDebugLoc(MI); + GStore &Store = cast<GStore>(MI); + assert(MRI.getType(Store.getValueReg()).isVector() && + "Expected a vector store value"); + LLT NewTy = LLT::scalar(64); + Register PtrReg = Store.getPointerReg(); + auto Zero = B.buildConstant(NewTy, 0); + auto HighPtr = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg, + B.buildConstant(LLT::scalar(64), 8)); + auto &MF = *MI.getMF(); + auto *LowMMO = MF.getMachineMemOperand(&Store.getMMO(), 0, NewTy); + auto *HighMMO = MF.getMachineMemOperand(&Store.getMMO(), 8, NewTy); + B.buildStore(Zero, PtrReg, *LowMMO); + B.buildStore(Zero, HighPtr, *HighMMO); + Store.eraseFromParent(); +} + #define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AArch64GenPostLegalizeGICombiner.inc" #undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 40ddf6a94f73..515a5c63a559 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -430,6 +430,8 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { case TargetOpcode::G_INTRINSIC_ROUND: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMINIMUM: return true; } return false; @@ -600,6 +602,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FSUB: case TargetOpcode::G_FMUL: case TargetOpcode::G_FDIV: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMINIMUM: return 
getSameKindOfOperandsMapping(MI); case TargetOpcode::G_FPEXT: { LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 90688f1a3e83..c1186ae804d2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -239,8 +239,8 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) { {codeview::RegisterId::ARM64_Q31, AArch64::Q31}, }; - for (unsigned I = 0; I < array_lengthof(RegMap); ++I) - MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); + for (const auto &I : RegMap) + MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg)); } static MCRegisterInfo *createAArch64MCRegisterInfo(const Triple &Triple) { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 010ffa1502de..bb488cd7da32 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -197,34 +197,42 @@ def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64 def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>; def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>; def SVEAddSubImm32Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i32>", []>; -def SVEAddSubImm64Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i64>", []>; +def SVEAddSubImm64Pat : ComplexPattern<i64, 2, "SelectSVEAddSubImm<MVT::i64>", []>; -def SVELogicalImm8Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8>", []>; -def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>", []>; -def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>; +def SVELogicalImm8Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8>", []>; +def SVELogicalImm16Pat : 
ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16>", []>; +def SVELogicalImm32Pat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32>", []>; def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>; -def SVELogicalImm8NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i8, true>", []>; -def SVELogicalImm16NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16, true>", []>; -def SVELogicalImm32NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32, true>", []>; +def SVELogicalImm8NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i8, true>", []>; +def SVELogicalImm16NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i16, true>", []>; +def SVELogicalImm32NotPat : ComplexPattern<i32, 1, "SelectSVELogicalImm<MVT::i32, true>", []>; def SVELogicalImm64NotPat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64, true>", []>; -def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; +def SVE8BitLslImm32 : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>; +def SVE8BitLslImm64 : ComplexPattern<i64, 2, "SelectSVE8BitLslImm", [imm]>; +class SVE8BitLslImm<ValueType ty> { + ComplexPattern Pat = !cond( + !eq(ty, i32): SVE8BitLslImm32, + !eq(ty, i64): SVE8BitLslImm64); +} def SVEArithUImm8Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i8>", []>; def SVEArithUImm16Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i16>", []>; def SVEArithUImm32Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i32>", []>; -def SVEArithUImm64Pat : ComplexPattern<i32, 1, "SelectSVEArithImm<MVT::i64>", []>; -def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; +def SVEArithUImm64Pat : ComplexPattern<i64, 1, "SelectSVEArithImm<MVT::i64>", []>; + +def SVEArithSImmPat32 : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>; +def SVEArithSImmPat64 : ComplexPattern<i64, 1, "SelectSVESignedArithImm", []>; def SVEShiftImmL8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 7>", []>; 
def SVEShiftImmL16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 15>", []>; def SVEShiftImmL32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 31>", []>; -def SVEShiftImmL64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<0, 63>", []>; +def SVEShiftImmL64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<0, 63>", []>; def SVEShiftImmR8 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 8, true>", []>; def SVEShiftImmR16 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 16, true>", []>; def SVEShiftImmR32 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 32, true>", []>; -def SVEShiftImmR64 : ComplexPattern<i32, 1, "SelectSVEShiftImm<1, 64, true>", []>; +def SVEShiftImmR64 : ComplexPattern<i64, 1, "SelectSVEShiftImm<1, 64, true>", []>; def SVEAllActive : ComplexPattern<untyped, 0, "SelectAllActivePredicate", []>; @@ -260,14 +268,14 @@ def sve_incdec_imm : Operand<i32>, TImmLeaf<i32, [{ } // This allows i32 immediate extraction from i64 based arithmetic. -def sve_cnt_mul_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">; -def sve_cnt_shl_imm : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, true>">; - +def sve_cnt_mul_imm_i32 : ComplexPattern<i32, 1, "SelectCntImm<1, 16, 1, false>">; +def sve_cnt_mul_imm_i64 : ComplexPattern<i64, 1, "SelectCntImm<1, 16, 1, false>">; +def sve_cnt_shl_imm : ComplexPattern<i64, 1, "SelectCntImm<1, 16, 1, true>">; -def sve_ext_imm_0_31 : ComplexPattern<i32, 1, "SelectEXTImm<31, 8>">; -def sve_ext_imm_0_63 : ComplexPattern<i32, 1, "SelectEXTImm<63, 4>">; -def sve_ext_imm_0_127 : ComplexPattern<i32, 1, "SelectEXTImm<127, 2>">; -def sve_ext_imm_0_255 : ComplexPattern<i32, 1, "SelectEXTImm<255, 1>">; +def sve_ext_imm_0_31 : ComplexPattern<i64, 1, "SelectEXTImm<31, 8>">; +def sve_ext_imm_0_63 : ComplexPattern<i64, 1, "SelectEXTImm<63, 4>">; +def sve_ext_imm_0_127 : ComplexPattern<i64, 1, "SelectEXTImm<127, 2>">; +def sve_ext_imm_0_255 : ComplexPattern<i64, 1, "SelectEXTImm<255, 1>">; def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, 
node:$src2), (int_aarch64_sve_cntp node:$pred, node:$src2), [{ @@ -435,8 +443,8 @@ class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1, : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; -def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>; -def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>; +def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>; +def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>; let AddedComplexity = 1 in { class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1, @@ -868,10 +876,10 @@ multiclass sve_int_count<bits<3> opc, string asm, SDPatternOperator op> { def : InstAlias<asm # "\t$Rd", (!cast<Instruction>(NAME) GPR64:$Rd, 0b11111, 1), 2>; - def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm))), + def : Pat<(i64 (mul (op sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm))), (!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; - def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm)))), + def : Pat<(i64 (shl (op sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm))), (!cast<Instruction>(NAME) sve_pred_enum:$pattern, sve_incdec_imm:$imm)>; def : Pat<(i64 (op sve_pred_enum:$pattern)), @@ -951,10 +959,10 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, def : Pat<(i64 (op GPR64:$Rdn, (opcnt sve_pred_enum:$pattern))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, 1)>; - def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm i32:$imm)))), + def : Pat<(i64 (op GPR64:$Rdn, (mul (opcnt sve_pred_enum:$pattern), (sve_cnt_mul_imm_i64 i32:$imm)))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; - def : Pat<(i64 (op GPR64:$Rdn, (shl (opcnt sve_pred_enum:$pattern), (i64 (sve_cnt_shl_imm i32:$imm))))), + def : Pat<(i64 (op GPR64:$Rdn, (shl (opcnt 
sve_pred_enum:$pattern), (sve_cnt_shl_imm i32:$imm)))), (!cast<Instruction>(NAME) GPR64:$Rdn, sve_pred_enum:$pattern, $imm)>; def : Pat<(i32 (op GPR32:$Rdn, (i32 (trunc (opcnt (sve_pred_enum:$pattern)))))), @@ -962,12 +970,12 @@ multiclass sve_int_pred_pattern_a<bits<3> opc, string asm, GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, 1), sub_32))>; - def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm i32:$imm)))), + def : Pat<(i32 (op GPR32:$Rdn, (mul (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_mul_imm_i32 i32:$imm)))), (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), sub_32))>; - def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (i64 (sve_cnt_shl_imm i32:$imm))))), + def : Pat<(i32 (op GPR32:$Rdn, (shl (i32 (trunc (opcnt (sve_pred_enum:$pattern)))), (sve_cnt_shl_imm i32:$imm)))), (i32 (EXTRACT_SUBREG (!cast<Instruction>(NAME) (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rdn, sub_32), sve_pred_enum:$pattern, $imm), sub_32))>; @@ -4324,10 +4332,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _B)>; + def : 
SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> { @@ -4348,10 +4356,10 @@ multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> { def _S : sve_int_arith_imm<0b10, 0b110000, asm, ZPR32, simm8>; def _D : sve_int_arith_imm<0b11, 0b110000, asm, ZPR64, simm8>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>; - def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _B)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _H)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Imm_Arith_All_Active<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat64, !cast<Instruction>(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -4542,7 +4550,7 @@ multiclass sve_int_dup_imm_pred_merge_inst< (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>; def : Pat<(intty (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty 
(SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), intty:$Zd)), (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>; } @@ -4580,7 +4588,7 @@ multiclass sve_int_dup_imm_pred_zero_inst< (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>; def : Pat<(intty (vselect predty:$Pg, - (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))), + (intty (AArch64dup (scalarty (SVE8BitLslImm<scalarty>.Pat i32:$imm, i32:$shift)))), (intty (AArch64dup (scalarty 0))))), (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>; } @@ -6476,14 +6484,14 @@ multiclass sve_int_perm_rev_revh<string asm, SDPatternOperator op> { def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>; - def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; - def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } multiclass sve_int_perm_rev_revw<string asm, SDPatternOperator op> { def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>; - def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; + def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>; } class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty, @@ -8377,13 +8385,13 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter } /// Addressing modes -def am_sve_indexed_s4 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>; -def am_sve_indexed_s6 :ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>; +def am_sve_indexed_s4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>; +def am_sve_indexed_s6 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32,31>", 
[], [SDNPWantRoot]>; -def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>; -def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>; -def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>; -def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>; +def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>; +def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>; +def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>; +def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>; // Predicated pseudo floating point two operand instructions. multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> { diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index e72dccdc4b78..642080a0d40d 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -152,7 +152,7 @@ bool SVEIntrinsicOpts::coalescePTrueIntrinsicCalls( // Remove the most encompassing ptrue, as well as any promoted ptrues, leaving // behind only the ptrues to be coalesced. PTrues.remove(MostEncompassingPTrue); - PTrues.remove_if([](auto *PTrue) { return isPTruePromoted(PTrue); }); + PTrues.remove_if(isPTruePromoted); // Hoist MostEncompassingPTrue to the start of the basic block. 
It is always // safe to do this, since ptrue intrinsic calls are guaranteed to have no @@ -287,10 +287,10 @@ bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) { if (!Attr.isValid()) return false; - unsigned MinVScale, MaxVScale; - std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs(); + unsigned MinVScale = Attr.getVScaleRangeMin(); + Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax(); // The transform needs to know the exact runtime length of scalable vectors - if (MinVScale != MaxVScale || MinVScale == 0) + if (!MaxVScale || MinVScale != MaxVScale) return false; auto *PredType = @@ -351,10 +351,10 @@ bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) { if (!Attr.isValid()) return false; - unsigned MinVScale, MaxVScale; - std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs(); + unsigned MinVScale = Attr.getVScaleRangeMin(); + Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax(); // The transform needs to know the exact runtime length of scalable vectors - if (MinVScale != MaxVScale || MinVScale == 0) + if (!MaxVScale || MinVScale != MaxVScale) return false; auto *PredType = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index aab76d27ef11..d28f38e42430 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -173,14 +173,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { const AMDGPUFunctionArgInfo & AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - return FixedABIFunctionInfo; - - // Without the fixed ABI, we assume no function has special inputs. 
- assert(F.isDeclaration()); - return ExternFunctionInfo; - } - + if (I == ArgInfoMap.end()) + return FixedABIFunctionInfo; return I->second; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f0aadab3302f..b4ebc7d7d75f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -112,6 +112,17 @@ static bool isDSAddress(const Constant *C) { return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; } +/// Returns true if the function requires the implicit argument be passed +/// regardless of the function contents. +static bool funcRequiresImplicitArgPtr(const Function &F) { + // Sanitizers require the hostcall buffer passed in the implicit arguments. + return F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeThread) || + F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.hasFnAttribute(Attribute::SanitizeMemTag); +} + namespace { class AMDGPUInformationCache : public InformationCache { public: @@ -296,7 +307,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { bool AllCallSitesKnown = true; if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) - indicatePessimisticFixpoint(); + return indicatePessimisticFixpoint(); return Change; } @@ -339,7 +350,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); + + // If the function requires the implicit arg pointer due to sanitizers, + // assume it's needed even if explicitly marked as not requiring it. 
+ const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F); + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); + for (auto Attr : ImplicitAttrs) { + if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR) + continue; + if (F->hasFnAttribute(Attr.second)) addKnownBits(Attr.first); } @@ -500,6 +521,9 @@ struct AAAMDFlatWorkGroupSize std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); intersectKnown( ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); } ChangeStatus updateImpl(Attributor &A) override { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 43928d7c2a09..2f1e7823f65c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -652,8 +652,8 @@ bool AMDGPUCallLowering::lowerFormalArguments( ++PSInputNum; if (SkipArg) { - for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) - B.buildUndef(VRegs[Idx][I]); + for (Register R : VRegs[Idx]) + B.buildUndef(R); ++Idx; continue; @@ -715,10 +715,9 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!MBB.empty()) B.setInstr(*MBB.begin()); - if (!IsEntryFunc) { + if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); @@ -731,11 +730,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( uint64_t StackOffset = Assigner.StackOffset; - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. 
if (IsEntryFunc) { TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); @@ -829,9 +823,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArg) { LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); - } else { - assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else { + // We may have proven the input wasn't needed, although the ABI is + // requiring it. We just need to allocate the register appropriately. + MIRBuilder.buildUndef(InputReg); } if (OutgoingArg->isRegister()) { @@ -1233,8 +1230,7 @@ bool AMDGPUCallLowering::lowerTailCall( // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1300,12 +1296,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { - LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); - return false; - } - SmallVector<ArgInfo, 8> OutArgs; for (auto &OrigArg : Info.OrigArgs) splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); @@ -1359,8 +1349,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. 
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c7c5ff7bcbe7..2415fdfecaae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -64,6 +64,30 @@ def int_minmax_to_med3 : GICombineRule< [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; +def fp_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + +def fp_minmax_to_clamp : GICombineRule< + (defs root:$min_or_max, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>; + +def fmed3_intrinsic_to_clamp : GICombineRule< + (defs root:$fmed3, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC):$fmed3, + [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -102,7 +126,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def 
AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { + "AMDGPUGenRegBankCombinerHelper", + [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 301e6f6d6f42..e79ff9b597c9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -378,5 +378,4 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, } MI.eraseFromParent(); - return; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 12cef2774aaf..7fd94a977be7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -172,6 +172,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; +def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; +def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b9c59f4c615a..699c6c479455 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamerV2::emitKernelArgs(const Function &Func) { +void 
MetadataStreamerV2::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - emitHiddenKernelArgs(Func); + emitHiddenKernelArgs(Func, ST); } void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { @@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST) { + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF, HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); - emitKernelArgs(Func); + emitKernelArgs(Func, ST); HSAMetadata.mKernels.back().mCodeProps = CodeProps; HSAMetadata.mKernels.back().mDebugProps = DebugProps; } @@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func, } void MetadataStreamerV3::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST, msgpack::MapDocNode Kern) { unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) emitKernelArg(Arg, Offset, Args); - emitHiddenKernelArgs(Func, Offset, Args); + emitHiddenKernelArgs(Func, ST, Offset, Args); Kern[".args"] = Args; } @@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg( } void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST, unsigned &Offset, msgpack::ArrayDocNode Args) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, 
"amdgpu-implicitarg-num-bytes", 0); - + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); auto Kern = getHSAKernelProps(MF, ProgramInfo); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); @@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); - emitKernelArgs(Func, Kern); + emitKernelArgs(Func, ST, Kern); } Kernels.push_back(Kern); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index af5dae1cd8c0..54ed0afbba6d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,6 +30,7 @@ class MDNode; class Module; struct SIProgramInfo; class Type; +class GCNSubtarget; namespace AMDGPU { @@ -86,7 +87,8 @@ protected: void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST, + msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -98,8 +100,8 @@ protected: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayDocNode Args); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST, + unsigned &Offset, msgpack::ArrayDocNode Args); msgpack::DocNode &getRootMetadata(StringRef Key) { return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; @@ 
-173,7 +175,7 @@ private: void emitKernelAttrs(const Function &Func); - void emitKernelArgs(const Function &Func); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); void emitKernelArg(const Argument &Arg); @@ -183,7 +185,7 @@ private: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); const Metadata &getHSAMetadata() const { return HSAMetadata; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 88b4ec53a2a0..db84b8766924 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -892,6 +892,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + if (isa<UndefValue>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + + if (isa<ConstantPointerNull>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); + break; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1f898f2ba8b3..5046daaed977 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -533,7 +533,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .minScalar(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); @@ -541,7 +541,7 @@ 
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalarOrElt(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32) .lower(); @@ -712,7 +712,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } if (ST.hasVOP3PInsts()) - FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions.clampMaxNumElementsStrict(0, S16, 2); FPOpActions .scalarize(0) @@ -728,7 +728,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_FNEG, G_FABS}) .legalFor(FPTypesPK16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .clampScalar(0, S16, S64); @@ -965,7 +965,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({S16, S32, V2S16}) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) // FIXME: Fixing non-power-of-2 before clamp is workaround for // narrowScalar limitation. .widenScalarToNextPow2(0) @@ -1052,10 +1052,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. 
unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; - - if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, AlignBits); if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -1077,12 +1073,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return true; } - if (AlignBits < MemSize) { - const SITargetLowering *TLI = ST.getTargetLowering(); - return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, - Align(AlignBits / 8)); - } - return false; }; @@ -1176,20 +1166,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); - if (!isPowerOf2_32(DstSize)) { - // We're probably decomposing an odd sized store. Try to split - // to the widest type. TODO: Account for alignment. As-is it - // should be OK, since the new parts will be further legalized. - unsigned FloorSize = PowerOf2Floor(DstSize); - return std::make_pair(0, LLT::scalar(FloorSize)); - } - - if (DstSize > 32 && (DstSize % 32 != 0)) { - // FIXME: Need a way to specify non-extload of larger size if - // suitably aligned. - return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); - } - unsigned MaxSize = maxSizeForAddrSpace(ST, PtrTy.getAddressSpace(), Op == G_LOAD); @@ -1257,14 +1233,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, ElementCount::getFixed(FloorSize / EltSize), EltTy)); } - // Need to split because of alignment. - unsigned Align = Query.MMODescrs[0].AlignInBits; - if (EltSize > Align && - (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair( - 0, LLT::fixed_vector(EltSize / Align, EltTy)); - } - // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) @@ -1457,6 +1425,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. 
getActionDefinitionsBuilder(Op) .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + .lowerIf([=](const LegalityQuery &Query) { + // Sub-vector(or single element) insert and extract. + // TODO: verify immediate offset here since lower only works with + // whole elements. + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.isVector(); + }) // FIXME: Multiples of 16 should not be legal. .legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1615,7 +1590,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Prefer to reduce vector widths for 16-bit vectors before lowering, to // get more vector shift opportunities, since we'll get those when // expanded. - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + .clampMaxNumElementsStrict(0, S16, 2); } else if (ST.has16BitInsts()) { SextInReg.lowerFor({{S32}, {S64}, {S16}}); } else { @@ -1637,14 +1612,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FSHR) .legalFor({{S32, S32}}) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_FSHL) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); } else { @@ -2567,10 +2542,8 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, } else { // For cases where the widened type isn't a nice register value, unmerge // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); - B.setInsertPt(B.getMBB(), MI.getIterator()); - B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildDeleteTrailingVectorElements(ValReg, WideLoad); } } @@ -3843,6 +3816,10 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, llvm_unreachable("invalid data type"); } + if (StoreVT == LLT::fixed_vector(3, S16)) { + Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) + .getReg(0); + } return Reg; } @@ -4237,8 +4214,17 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { // Handle any gradient or coordinate operands that should not be packed - AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); - PackedAddrs.push_back(AddrReg); + if ((I < Intr->GradientStart) && IsA16 && + (B.getMRI()->getType(AddrReg) == S16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. @@ -4676,9 +4662,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Deal with the one annoying legal case. 
const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { - padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); - B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + if (IsTFE) { + if (ResultRegs.size() == 1) { + NewResultReg = ResultRegs[0]; + } else if (ResultRegs.size() == 2) { + LLT V4S16 = LLT::fixed_vector(4, 16); + NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); + } else { + return false; + } + } + + if (MRI->getType(DstReg).getNumElements() < + MRI->getType(NewResultReg).getNumElements()) { + B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); + } else { + B.buildPadVectorWithUndefElements(DstReg, NewResultReg); + } return true; } @@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, } Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B] (Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); Ops.push_back(Unmerge.getReg(0)); Ops.push_back(Unmerge.getReg(1)); Ops.push_back(Unmerge.getReg(2)); @@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, packLanes(RayOrigin); if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); Register R1 = MRI.createGenericVirtualRegister(S32); Register R2 = MRI.createGenericVirtualRegister(S32); Register R3 = MRI.createGenericVirtualRegister(S32); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 12d6d35a6917..6e2b5dc471bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp 
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,13 +24,6 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // -// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct -// type which would cause allocating huge memory for struct instance within -// every kernel. Hence, before running this pass, it is advisable to run the -// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within -// non-kernel functions by pointers and thereby minimizes the unnecessary per -// kernel allocation of LDS memory. -// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals( namespace { +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass { removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { SmallPtrSet<Constant *, 32> LocalVarsSet; - for (size_t I = 0; I < LocalVars.size(); I++) { - if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + for (GlobalVariable *LocalVar : LocalVars) + if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) LocalVarsSet.insert(C); - } - } removeFromUsedList(M, "llvm.used", LocalVarsSet); removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); } @@ -158,9 +163,9 @@ public: } bool runOnModule(Module &M) override { - UsedList = 
AMDGPU::getUsedList(M); - - bool Changed = processUsedLDS(M); + UsedList = getUsedList(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -177,6 +182,50 @@ public: } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -190,31 +239,6 @@ private: return false; } - // Increase the alignment of LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. 
- if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { @@ -343,20 +367,14 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - // Mark kernels with asm that reads the address of the allocated structure - // This is not necessary for lowering. This lets other passes, specifically - // PromoteAlloca, accurately calculate how much LDS will be used by the - // kernel after lowering. + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. 
if (!F) { IRBuilder<> Builder(Ctx); - SmallPtrSet<Function *, 32> Kernels; for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - - if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { markUsedByKernel(Builder, &Func, SGV); - Kernels.insert(&Func); } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 5d4b007f11e6..4e2f98d2a5db 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -2786,12 +2786,8 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec // Fixme: Move linearization creation to the original spot createLinearizedRegion(Region, SelectOut); - for (auto CI = Region->getChildren()->begin(), - CE = Region->getChildren()->end(); - CI != CE; ++CI) { - InnerSelectOut = - initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); - } + for (auto *CI : *Region->getChildren()) + InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); MRT->setBBSelectRegIn(InnerSelectOut); return InnerSelectOut; } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 2aa02299ecdc..8ad344816ad2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -119,31 +119,27 @@ private: bool isConstantAddr(const Value *V) const; }; -static const Value *getMemoryInstrPtr(const Instruction *Inst) { - if (auto LI = dyn_cast<LoadInst>(Inst)) { - return LI->getPointerOperand(); - } - if (auto SI = dyn_cast<StoreInst>(Inst)) { - return SI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto MI = 
dyn_cast<AnyMemIntrinsic>(Inst)) { - return MI->getRawDest(); - } +static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( + const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) + return {LI->getPointerOperand(), LI->getType()}; + if (auto SI = dyn_cast<StoreInst>(Inst)) + return {SI->getPointerOperand(), SI->getValueOperand()->getType()}; + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) + return {AI->getPointerOperand(), AI->getCompareOperand()->getType()}; + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) + return {AI->getPointerOperand(), AI->getValOperand()->getType()}; + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) + return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())}; - return nullptr; + return {nullptr, nullptr}; } bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); SmallSet<const Value *, 32> WorkSet; SmallSet<const Value *, 32> Visited; - if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); } @@ -209,10 +205,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); for (auto &I : B) { - if (const Value *Ptr = getMemoryInstrPtr(&I)) { - unsigned Size = divideCeil( - Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), - 32); + if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { + unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -326,7 +320,7 @@ bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { AMDGPUPerfHint::MemAccessInfo AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { MemAccessInfo MAI; - const Value *MO = getMemoryInstrPtr(Inst); + const Value *MO = getMemoryInstrPtrAndType(Inst).first; LLVM_DEBUG(dbgs() << "[isLargeStride] MO: 
" << *MO << '\n'); // Do not treat local-addr memory access as large stride. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3ec5dd7e0eff..f9a9fe403ff6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -939,7 +939,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(MaybeAlign(I.getAlignment())); + GV->setAlignment(I.getAlign()); Value *TCntY, *TCntZ; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 12b5830ef930..3ce67a733c10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" @@ -36,13 +38,15 @@ protected: MachineRegisterInfo &MRI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; + const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + TRI(*MF.getSubtarget().getRegisterInfo()), + TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), 
Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -63,7 +67,19 @@ public: Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyClamp(MachineInstr &MI, Register &Reg); + +private: + AMDGPU::SIModeRegisterDefaults getMode(); + bool getIEEE(); + bool getDX10Clamp(); + bool isFminnumIeee(const MachineInstr &MI); + bool isFCst(MachineInstr *MI); + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { @@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINNUM_IEEE: + return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, + AMDGPU::G_AMDGPU_FMED3}; } } @@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( return true; } +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) +// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K +// ieee = false : min/max(NaN, K) = K +// clamp(NaN) = dx10_clamp ? 0.0 : NaN +// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. +// Other operand commutes (see matchMed) give same result since min and max are +// commutative. + +// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 +// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. 
+// Val = SNaN only for ieee = true +// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 +// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 +// max(min(SNaN, K1), K0) = max(K1, K0) = K1 +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 +// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) +// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + return false; + + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (K0->Value > K1->Value) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to + // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner + // nodes(max/min) have same behavior when one input is NaN and other isn't. + // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, + // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + // Don't fold single use constant that can't be inlined. 
+ if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && + (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { + MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; + return true; + } + } + + return false; +} + +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) { + // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). + if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates + // to 0.0 requires dx10_clamp = true. + if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && + isKnownNeverSNaN(Val, MRI)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + Reg = Val; + return true; + } + + return false; +} + +// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. +// Val = SNaN only for ieee = true. It is important which operand is NaN. 
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 +// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 +// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 +// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 +// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 +bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) { + if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) + return false; + + // In llvm-ir, clamp is often represented as an intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (isFCst(Src1) && !isFCst(Src2)) + std::swap(Src1, Src2); + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (!isClampZeroToOne(Src1, Src2)) + return false; + + Register Val = Src0->getOperand(0).getReg(); + + auto isOp3Zero = [&]() { + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) + return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); + return false; + }; + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold + // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. 
+ if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + (getIEEE() && getDX10Clamp() && + (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + Reg = Val; + return true; + } + + return false; +} + +void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, + MI.getFlags()); + MI.eraseFromParent(); +} + void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); @@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { + return MF.getInfo<SIMachineFunctionInfo>()->getMode(); +} + +bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } + +bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } + +bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; +} + +bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::G_FCONSTANT; +} + +bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) { + if (isFCst(K0) && isFCst(K1)) { + const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); + const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); + return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || + (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); + } + return false; +} + class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5988403c0a29..c60012bcfe2e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -707,9 +707,6 @@ bool 
AMDGPURegisterBankInfo::executeInWaterfallLoop( iterator_range<MachineBasicBlock::iterator> Range, SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const { - SmallVector<Register, 4> ResultRegs; - SmallVector<Register, 4> InitResultRegs; - SmallVector<Register, 4> PhiRegs; // Track use registers which have already been expanded with a readfirstlane // sequence. This may have multiple uses if moving a sequence. @@ -774,15 +771,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(NewExec) .addMBB(LoopBB); - for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { - B.buildInstr(TargetOpcode::G_PHI) - .addDef(std::get<2>(Result)) - .addReg(std::get<0>(Result)) // Initial value / implicit_def - .addMBB(&MBB) - .addReg(std::get<1>(Result)) // Mid-loop value. - .addMBB(LoopBB); - } - const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); @@ -1174,18 +1162,25 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
if (MMO->getAlign() < Align(16)) { + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); + MachineIRBuilder B(MI, ApplyBank); + LegalizerHelper Helper(*MF, ApplyBank, B); LLT Part64, Part32; std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != + LegalizerHelper::Legalized) + return false; + return true; } else { LLT WiderTy = widen96To128(LoadTy); auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + if (WiderTy.isScalar()) + B.buildTrunc(MI.getOperand(0), WideLoad); + else { + B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), + WideLoad); + } } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index d55bf3917e9c..2475b44b42a3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -87,6 +87,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -110,6 +111,18 @@ using namespace llvm; namespace { +namespace AMDGPU { +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. 
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +} // namespace AMDGPU + class ReplaceLDSUseImpl { Module &M; LLVMContext &Ctx; @@ -127,7 +140,8 @@ class ReplaceLDSUseImpl { // Collect LDS which requires their uses to be replaced by pointer. std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { // Collect LDS which requires module lowering. - std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M); + std::vector<GlobalVariable *> LDSGlobals = + llvm::AMDGPU::findVariablesToLower(M); // Remove LDS which don't qualify for replacement. llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { @@ -172,7 +186,7 @@ class ReplaceLDSUseImpl { AMDGPUAS::LOCAL_ADDRESS); LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); // Mark that an associated LDS pointer is created for LDS. LDSToPointer[GV] = LDSPointer; @@ -245,10 +259,9 @@ class ReplaceLDSUseImpl { auto FunctionToInsts = AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); - FI != FE; ++FI) { - Function *F = FI->first; - auto &Insts = FI->second; + for (const auto &FunctionToInst : FunctionToInsts) { + Function *F = FunctionToInst.first; + auto &Insts = FunctionToInst.second; for (auto *I : Insts) { // If `U` is a constant expression, then we need to break the // associated instruction into a set of separate instructions by @@ -341,10 +354,9 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { // Traverse through each kernel K, check and if required, initialize the // LDS pointer to point to LDS within K. 
- for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; - ++KI) { - Function *K = KI->first; - SmallPtrSet<Function *, 8> Callees = KI->second; + for (const auto &KernelToCallee : KernelToCallees) { + Function *K = KernelToCallee.first; + SmallPtrSet<Function *, 8> Callees = KernelToCallee.second; // Compute reachable and LDS used callees for kernel K. set_intersect(Callees, LDSAccessors); @@ -378,6 +390,184 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { return true; } +namespace AMDGPU { + +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; + + // Collect all address taken functions within the module. + void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (const auto &GI : *ECNode) { + auto *CGN = GI.second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { + SmallPtrSet<Function *, 8> ReachableCallees; + + // Call graph node which represents this kernel. + auto *KCGN = CG[K]; + + // Go through all call graph nodes reachable from the node representing this + // kernel, visit all their call sites, if the call site is direct, add + // corresponding callee to reachable callee set, if it is indirect, resolve + // the indirect call site to potential reachable callees, add them to + // reachable callee set, and repeat the process for the newly added + // potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. 
+ // + SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN)); + SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + // Ignore call graph node which does not have associated function or + // associated function is not a definition. + if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) + continue; + + for (const auto &GI : *CGN) { + auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCGN = GI.second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + CGNStack.append(df_begin(ACGN), df_end(ACGN)); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!llvm::AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. 
+void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet<Function *, 8> LDSAccessors; + SmallVector<User *, 8> UserStack(GV->users()); + SmallPtrSet<User *, 8> VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa<GlobalValue>(U)) + return SmallPtrSet<Function *, 8>(); + + // Recursively explore constant users. + if (isa<Constant>(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. 
+ Function *F = cast<Instruction>(U)->getFunction(); + if (!llvm::AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; + SmallVector<User *, 8> UserStack; + SmallPtrSet<User *, 8> VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa<GlobalValue>(UU)) + continue; + + if (isa<Constant>(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast<Instruction>(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + +} // namespace AMDGPU + // Entry-point function which interface ReplaceLDSUseImpl with outside of the // class. bool ReplaceLDSUseImpl::replaceLDSUse() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0655b4342ba1..cd05797fdbdb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -413,21 +413,21 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { case AMDGPU::V_MAX_I16_e32: case AMDGPU::V_MIN_I16_e64: case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: // On gfx10, all 16-bit instructions preserve the high bits. 
return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MAD_F16_e64: case AMDGPU::V_MADAK_F16: case AMDGPU::V_MADMK_F16: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAMK_F16: case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_e32: - case AMDGPU::V_DIV_FIXUP_F16_e64: // In gfx9, the preferred handling of the unused high 16-bits changed. Most // instructions maintain the legacy behavior of 0ing. Some instructions // changed to preserving the high bits. @@ -648,9 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + assert(AMDGPU::isKernel(F.getCallingConv())); + + // We don't allocate the segment if we know the implicit arguments weren't + // used, even if the ABI implies we need them. + if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return 0; + if (isMesaKernel(F)) return 16; - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + + // Assume all implicit inputs are used by default + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index de11676279f2..a2c61f9da8da 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -231,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( - "amdgpu-fixed-function-abi", - cl::desc("Enable all implicit function arguments"), - cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), - cl::init(false), - cl::Hidden); - // Enable lib calls 
simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -505,7 +498,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; -bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 0ff2db2a52d9..226646a96953 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,7 +37,6 @@ protected: public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; - static bool EnableFixedFunctionABI; static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index ecdbdf613a53..09c5eb192e1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -519,57 +519,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - EVT OrigTy = TLI->getValueType(DL, Ty); - if (!OrigTy.isSimple()) { - // FIXME: We're having to query the throughput cost so that the basic - // implementation tries to generate legalize and scalarization costs. Maybe - // we could hoist the scalarization code here? - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); - // Scalarization - - // Check if any of the operands are vector operands. 
- int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - - bool IsFloat = Ty->isFPOrFPVectorTy(); - // Assume that floating point arithmetic operations cost twice as much as - // integer operations. - unsigned OpCost = (IsFloat ? 2 : 1); - - if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { - // The operation is legal. Assume it costs 1. - // TODO: Once we have extract/insert subvector cost we need to use them. - return LT.first * OpCost; - } - - if (!TLI->isOperationExpand(ISD, LT.second)) { - // If the operation is custom lowered, then assume that the code is twice - // as expensive. - return LT.first * 2 * OpCost; - } - - // Else, assume that we need to scalarize this op. - // TODO: If one of the types get legalized by splitting, handle this - // similarly to what getCastInstrCost() does. - if (auto *VTy = dyn_cast<VectorType>(Ty)) { - unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); - InstructionCost Cost = getArithmeticInstrCost( - Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args, CxtI); - // Return the cost of multiple scalar invocation plus the cost of - // inserting and extracting the values. - SmallVector<Type *> Tys(Args.size(), Ty); - return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; - } - - // We don't know anything about this scalar instruction. - return OpCost; - } // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -742,40 +691,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); - EVT OrigTy = TLI->getValueType(DL, RetTy); - if (!OrigTy.isSimple()) { - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); - - // TODO: Combine these two logic paths. 
- if (ICA.isTypeBasedOnly()) - return getTypeBasedIntrinsicInstrCost(ICA, CostKind); - - unsigned RetVF = - (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() - : 1); - const IntrinsicInst *I = ICA.getInst(); - const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); - FastMathFlags FMF = ICA.getFlags(); - // Assume that we need to scalarize this intrinsic. - - // Compute the scalarization overhead based on Args for a vector - // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while - // CostModel will pass a vector RetTy and VF is 1. - InstructionCost ScalarizationCost = InstructionCost::getInvalid(); - if (RetVF > 1) { - ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += - getScalarizationOverhead(cast<VectorType>(RetTy), true, false); - ScalarizationCost += - getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); - } - - IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, - ScalarizationCost); - return getIntrinsicInstrCost(Attrs, CostKind); - } // Legalize the type. 
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); diff --git a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 712f6dece911..1736c078eb83 100644 --- a/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -173,10 +173,8 @@ protected: } static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs()); - } + for (const MachineLoop *L : LoopInfo) + L->print(dbgs()); } // UTILITY FUNCTIONS @@ -691,9 +689,7 @@ bool AMDGPUCFGStructurizer::prepare() { SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); + for (MachineLoop *LoopRep : *MLI) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -827,14 +823,13 @@ bool AMDGPUCFGStructurizer::run() { wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); - (*It).first->eraseFromParent(); //Remove from the parent Function. + for (auto &It : BlockInfoMap) { + if (It.second && It.second->IsRetired) { + assert((It.first)->getNumber() != -1); + LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); + It.first->eraseFromParent(); // Remove from the parent Function. 
} - delete (*It).second; + delete It.second; } BlockInfoMap.clear(); LLInfoMap.clear(); diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4acd77a9d5d2..2bb59086f391 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -246,8 +246,12 @@ public: return isRegKind() && !hasModifiers(); } + bool isRegOrInline(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type); + } + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { - return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); + return isRegOrInline(RCID, type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { @@ -372,7 +376,7 @@ public: bool isInlineValue() const; bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { - return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + return isRegOrInline(RCID, type) && !hasModifiers(); } bool isSCSrcB16() const { diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index d3644db7cf8b..a535c8cc0918 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; -def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">; +def MUBUFAddr64 : ComplexPattern<iPTR, 4, "SelectMUBUFAddr64">; +def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; +def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<iPTR, 
3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def BUFAddrKind { int Offset = 0; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index bb0aa648ff90..c7ec5308e6d0 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; -def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; -def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; +def FlatOffset : ComplexPattern<iPTR, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; +def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; +def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; -def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; -def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; +def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; +def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; //===----------------------------------------------------------------------===// // FLAT classes diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index f3f664f7972a..912bcc792e4d 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -120,8 +120,7 @@ unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst, // We will now look at each of the currently executing instructions // to find out if this wait instruction still needs to wait. 
- for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; + for (const InstRef &PrevIR : IssuedInst) { const Instruction &PrevInst = *PrevIR.getInstruction(); const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 29c37c706138..8a48a67b829c 100644 --- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -440,9 +440,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += 2 * Clause.second.size(); } @@ -452,9 +451,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += Clause.second.size(); } @@ -635,10 +633,10 @@ public: CfCount++; } MI->eraseFromParent(); - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, DL, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, DL, AluClauses[i], CfCount); + for (ClauseFile &CF : FetchClauses) + EmitFetchClause(I, DL, CF, CfCount); + for (ClauseFile &CF : AluClauses) + EmitALUClause(I, DL, CF, CfCount); break; } default: @@ -649,8 +647,7 @@ public: break; } } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { 
- MachineInstr *Alu = ToPopAfter[i]; + for (MachineInstr *Alu : ToPopAfter) { BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index a7ebf72315cb..aec8b1ae4837 100644 --- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -268,17 +268,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); + for (const auto &Op : OpTable) { + MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[0])); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - } return Result; } @@ -289,15 +287,14 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src2, R600::OpName::src2_sel}, }; - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); + for (const auto &Op : OpTable) { + int SrcIdx = getOperandIdx(MI.getOpcode(), Op[0]); if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { - MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -521,12 +518,11 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - 
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - R600::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); + for (MachineInstr *MI : IG) { + IGSrcs.push_back(ExtractSrcs(*MI, PV, ConstCount)); + unsigned Op = getOperandIdx(MI->getOpcode(), R600::OpName::bank_swizzle); + ValidSwizzle.push_back( + (R600InstrInfo::BankSwizzle)MI->getOperand(Op).getImm()); } std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) @@ -542,8 +538,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ALU_VEC_120_SCL_212, ALU_VEC_102_SCL_221 }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; + for (R600InstrInfo::BankSwizzle TransBS : TransSwz) { if (!isConstCompatible(TransBS, TransOps, ConstCount)) continue; bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, @@ -562,9 +557,9 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { assert (Consts.size() <= 12 && "Too many operands in instructions group"); unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); + for (unsigned Const : Consts) { + unsigned ReadConstHalf = Const & 2; + unsigned ReadConstIndex = Const & (~3); unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; if (!Pair1) { Pair1 = ReadHalfConst; @@ -587,12 +582,11 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) const { std::vector<unsigned> Consts; SmallSet<int64_t, 4> Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr &MI = *MIs[i]; - if (!isALUInstr(MI.getOpcode())) + for (MachineInstr *MI : MIs) { + if (!isALUInstr(MI->getOpcode())) continue; - for (const auto &Src : getSrcs(MI)) { + for (const auto &Src : getSrcs(*MI)) { if (Src.first->getReg() == 
R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -1330,11 +1324,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); - for (unsigned i = 0; i < 14; i++) { + for (unsigned Operand : Operands) { MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot))); assert (MO.isImm()); - setImmOperand(*MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operand, MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; diff --git a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 6aee2f591b56..d26879ed8d60 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -328,9 +328,9 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { void R600SchedStrategy::LoadAlu() { std::vector<SUnit *> &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); + for (SUnit *SU : QSrc) { + AluKind AK = getAluKind(SU); + AvailableAlus[AK].push_back(SU); } QSrc.clear(); } diff --git a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index ac6a3581e255..aa156190b7ae 100644 --- a/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -307,8 +307,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { // Build new MDNode. 
SmallVector<Metadata *, 6> KernelMDArgs; KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); - for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) - KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + for (const MDVector &MDV : NewArgMDs.ArgVector) + KernelMDArgs.push_back(MDNode::get(*Context, MDV)); MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); return std::make_tuple(NewF, NewMDNode); diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 72cf48c04e7f..795bc898a7bf 100644 --- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -150,19 +150,18 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; - for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { + for (auto &It : ToMerge->RegToChan) { DenseMap<Register, unsigned>::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); + Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, (*PosInUntouched).second)); + Remap.push_back( + std::pair<unsigned, unsigned>(It.second, (*PosInUntouched).second)); continue; } if (CurrentUndexIdx >= Untouched->UndefReg.size()) return false; - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + Remap.push_back(std::pair<unsigned, unsigned>( + It.second, Untouched->UndefReg[CurrentUndexIdx++])); } return true; @@ -172,9 +171,9 @@ static unsigned getReassignedChan( const std::vector<std::pair<unsigned, unsigned>> &RemapChan, unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return 
RemapChan[j].second; + for (const auto &J : RemapChan) { + if (J.first == Chan) + return J.second; } llvm_unreachable("Chan wasn't reassigned"); } @@ -190,11 +189,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<Register> UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { + for (const auto &It : RSI->RegToChan) { Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; + unsigned SubReg = It.first; + unsigned Swizzle = It.second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), @@ -234,14 +232,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByReg) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByUndefCount) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } } @@ -255,9 +251,9 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, Offset = 3; for (unsigned i = 0; i < 4; i++) { unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + 
for (const auto &J : RemapChan) { + if (J.first == Swizzle) { + MI.getOperand(i + Offset).setImm(J.second - 1); break; } } diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index beb0aad86e89..fbe2a1cd9fba 100644 --- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -127,8 +127,8 @@ private: R600::OpName::src1, R600::OpName::src2 }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); + for (unsigned Op : Ops) { + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op); if (OperandIdx < 0) continue; Register Src = MI.getOperand(OperandIdx).getReg(); diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index 99a1a8e9871a..c329bae50f92 100644 --- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -54,10 +54,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); - for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), - E = R600::R600_AddrRegClass.end(); I != E; ++I) { - reserveRegisterTuples(Reserved, *I); - } + for (MCPhysReg R : R600::R600_AddrRegClass) + reserveRegisterTuples(Reserved, R); TII->reserveIndirectRegisters(Reserved, MF, *this); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 200e00ee5521..1f93284fc7ee 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1620,7 +1620,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users, // in which case we can erase them all later in runOnMachineFunction. 
 if (MRI->use_nodbg_empty(MI.getOperand(0).getReg())) - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); return true; } @@ -1821,7 +1821,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1831,7 +1831,7 @@ } if (InstToErase && InstToErase->isRegSequence() && MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); } } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4706c74be721..d4fe74ecb96e 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1167,11 +1167,13 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (SpillVGPRToAGPR) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); bool SeenDbgInstr = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + int FrameIndex; if (MI.isDebugInstr()) SeenDbgInstr = true; @@ -1191,10 +1193,18 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SpillFIs.set(FI); continue; } - } + } else if (TII->isStoreToStackSlot(MI, FrameIndex) || + TII->isLoadFromStackSlot(MI, FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } + // Stack slot coloring may assign different objects to the same stack slot. + // If not, then the VGPR to AGPR spill slot is dead. 
+ for (unsigned FI : SpillFIs.set_bits()) + if (!NonVGPRSpillFIs.test(FI)) + FuncInfo->setVGPRToAGPRSpillDead(FI); + for (MachineBasicBlock &MBB : MF) { for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) MBB.addLiveIn(Reg); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 35b72f5d201b..9f138136e6e9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -2062,33 +2063,30 @@ void SITargetLowering::allocateSpecialInputSGPRs( SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); - // We need to allocate these in place regardless of their use. - const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; - // TODO: Unify handling with private memory pointers. - if (IsFixed || Info.hasDispatchPtr()) + if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (IsFixed || Info.hasQueuePtr()) + if (Info.hasQueuePtr()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. - if (IsFixed || Info.hasImplicitArgPtr()) + if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (IsFixed || Info.hasDispatchID()) + if (Info.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. 
- if (IsFixed || Info.hasWorkGroupIDX()) + if (Info.hasWorkGroupIDX()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); - if (IsFixed || Info.hasWorkGroupIDY()) + if (Info.hasWorkGroupIDY()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); - if (IsFixed || Info.hasWorkGroupIDZ()) + if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } @@ -2419,10 +2417,9 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); - } else { + } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2549,17 +2546,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + if (!IsGraphics) + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } auto &ArgUsageInfo = @@ -3123,8 +3116,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. 
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3263,12 +3255,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); - } - if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -6282,10 +6268,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Push back extra arguments. - for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) - VAddrs.push_back(Op.getOperand(ArgOffset + I)); - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6298,6 +6280,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + // Push back extra arguments. + for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { + if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. 
+ SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(bias); + } else + VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control // so both gradients and addresses must be 16 bit @@ -7502,8 +7495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(NodePtr.getValueType() == MVT::i32 || NodePtr.getValueType() == MVT::i64); - assert(RayDir.getValueType() == MVT::v4f16 || - RayDir.getValueType() == MVT::v4f32); + assert(RayDir.getValueType() == MVT::v3f16 || + RayDir.getValueType() == MVT::v3f32); if (!Subtarget->hasGFX10_AEncoding()) { emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); @@ -9837,11 +9830,13 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Opcode == AMDGPU::G_FCANONICALIZE) return true; - if (Opcode == AMDGPU::G_FCONSTANT) { - auto F = MI->getOperand(1).getFPImm()->getValueAPF(); - if (F.isNaN() && F.isSignaling()) + Optional<FPValueAndVReg> FCR; + // Constant splat (can be padded with undef) or scalar constant. + if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { + if (FCR->Value.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); + return !FCR->Value.isDenormal() || + denormalsEnabledForType(MRI.getType(FCR->VReg), MF); } if (MaxDepth == 0) @@ -11514,7 +11509,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Prefer VGPRs over AGPRs in mAI instructions where possible. // This saves a chain-copy of registers and better ballance register // use between vgpr and agpr as agpr tuples tend to be big. 
- if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -12477,6 +12472,6 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, if (Size <= 256) return Cost; - Cost.first = (Size + 255) / 256; + Cost.first += (Size + 255) / 256; return Cost; } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c9d9dd1fb82c..6fbe5d45ce0a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -30,6 +30,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" @@ -51,26 +52,6 @@ static cl::opt<bool> ForceEmitZeroFlag( cl::init(false), cl::Hidden); namespace { - -template <typename EnumT> -class enum_iterator - : public iterator_facade_base<enum_iterator<EnumT>, - std::forward_iterator_tag, const EnumT> { - EnumT Value; -public: - enum_iterator() = default; - enum_iterator(EnumT Value) : Value(Value) {} - - enum_iterator &operator++() { - Value = static_cast<EnumT>(Value + 1); - return *this; - } - - bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } - - EnumT operator*() const { return Value; } -}; - // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. 
@@ -78,27 +59,32 @@ public: #define CNT_MASK(t) (1u << (t)) enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +} // namespace -iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { - return make_range(enum_iterator<InstCounterType>(VM_CNT), - enum_iterator<InstCounterType>(NUM_INST_CNTS)); -} +namespace llvm { +template <> struct enum_iteration_traits<InstCounterType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { +auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } using RegInterval = std::pair<int, int>; -struct { +struct HardwareLimits { unsigned VmcntMax; unsigned ExpcntMax; unsigned LgkmcntMax; unsigned VscntMax; -} HardwareLimits; +}; -struct { +struct RegisterEncoding { unsigned VGPR0; unsigned VGPRL; unsigned SGPR0; unsigned SGPRL; -} RegisterEncoding; +}; enum WaitEventType { VMEM_ACCESS, // vector-memory read & write @@ -194,18 +180,20 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, + RegisterEncoding Encoding) + : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} - static unsigned getWaitCountMax(InstCounterType T) { + unsigned getWaitCountMax(InstCounterType T) const { switch (T) { case VM_CNT: - return HardwareLimits.VmcntMax; + return Limits.VmcntMax; case LGKM_CNT: - return HardwareLimits.LgkmcntMax; + return Limits.LgkmcntMax; case EXP_CNT: - return HardwareLimits.ExpcntMax; + return Limits.ExpcntMax; case VS_CNT: - return HardwareLimits.VscntMax; + return Limits.VscntMax; default: break; } @@ -338,6 +326,8 @@ private: unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + HardwareLimits Limits = {}; + RegisterEncoding Encoding = {}; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -471,14 +461,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); if (TRI->isVectorRegister(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); - Result.first = Reg - RegisterEncoding.VGPR0; + assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); + Result.first = Reg - Encoding.VGPR0; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); - Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } @@ -1589,20 +1579,22 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for 
(auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); - HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; + HardwareLimits Limits = {}; + Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + Limits.VscntMax = ST->hasVscnt() ? 63 : 0; unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; - RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; + RegisterEncoding Encoding = {}; + Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; + Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); BlockInfos.clear(); @@ -1652,9 +1644,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique<WaitcntBrackets>(ST); + Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); else - *Brackets = WaitcntBrackets(ST); + *Brackets = WaitcntBrackets(ST, Limits, Encoding); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1686,45 +1678,47 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } while (Repeat); - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; + if (ST->hasScalarStores()) { + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + bool HaveScalarStores = 
false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (!HaveScalarStores && TII->isScalarStore(MI)) - HaveScalarStores = true; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!HaveScalarStores && TII->isScalarStore(MI)) + HaveScalarStores = true; - if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } } - } - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. This could + // be improved by looking across blocks for flushes in postdominating + // blocks from the stores but an explicitly requested flush is probably + // very rare. 
+ for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; - // FIXME: It would be better to insert this before a waitcnt if any. - if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && - !SeenDCacheWB) { - Modified = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 92f5322b8ad2..1755b93538ce 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -899,8 +899,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isAGPRClass(RC)) { - Opcode = (RI.hasVGPRs(SrcRC)) ? 
- AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; + if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_MOV_B32; + else if (RI.hasVGPRs(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; + else + Opcode = AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && @@ -1417,6 +1421,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { } } +static unsigned getAVSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_SAVE; + case 8: + return AMDGPU::SI_SPILL_AV64_SAVE; + case 12: + return AMDGPU::SI_SPILL_AV96_SAVE; + case 16: + return AMDGPU::SI_SPILL_AV128_SAVE; + case 20: + return AMDGPU::SI_SPILL_AV160_SAVE; + case 24: + return AMDGPU::SI_SPILL_AV192_SAVE; + case 28: + return AMDGPU::SI_SPILL_AV224_SAVE; + case 32: + return AMDGPU::SI_SPILL_AV256_SAVE; + case 64: + return AMDGPU::SI_SPILL_AV512_SAVE; + case 128: + return AMDGPU::SI_SPILL_AV1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, @@ -1463,21 +1494,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) + : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - if (RI.isVectorSuperClass(RC)) { - // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an - // equivalent VGPR register beforehand. Regalloc might want to introduce - // AV spills only to be relevant until rewriter at which they become - // either spills of VGPRs or AGPRs. 
- Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) - .addReg(SrcReg, RegState::Kill); - SrcReg = TmpVReg; - } - BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1567,6 +1588,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { } } +static unsigned getAVSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_AV64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_AV96_RESTORE; + case 16: + return AMDGPU::SI_SPILL_AV128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_AV160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_AV192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_AV224_RESTORE; + case 32: + return AMDGPU::SI_SPILL_AV256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_AV512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_AV1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1609,26 +1657,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); - - bool IsVectorSuperClass = RI.isVectorSuperClass(RC); - Register TmpReg = DestReg; - if (IsVectorSuperClass) { - // For AV classes, insert the spill restore to a VGPR followed by a copy - // into an equivalent AV register. - MachineRegisterInfo &MRI = MF->getRegInfo(); - DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - } + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillRestoreOpcode(SpillSize) + : RI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); - - if (IsVectorSuperClass) - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) - .addReg(DestReg, RegState::Kill); + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2358,8 +2395,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); auto *ShAmt = MCConstantExpr::create(32, MCCtx); OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); - - return; } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { @@ -3106,23 +3141,26 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, } static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, - int64_t &Imm) { + int64_t &Imm, MachineInstr **DefMI = nullptr) { if (Reg.isPhysical()) return false; auto *Def = MRI.getUniqueVRegDef(Reg); if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { Imm = Def->getOperand(1).getImm(); + if (DefMI) + *DefMI = Def; return true; } return false; } -static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) { +static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, + MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; const MachineFunction *MF = MO->getParent()->getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - return getFoldableImm(MO->getReg(), MRI, Imm); + return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, @@ -3195,8 +3233,20 @@ MachineInstr 
*SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { + MachineInstr *DefMI; + const auto killDef = [&DefMI, &MBB, this]() -> void { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + return; + // We cannot just remove the DefMI here, calling pass will crash. + DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); + for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) + DefMI->RemoveOperand(I); + }; + int64_t Imm; - if (getFoldableImm(Src2, Imm)) { + if (getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3209,13 +3259,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm)) { + if (getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3225,10 +3276,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } - if (getFoldableImm(Src0, Imm)) { + if (getFoldableImm(Src0, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3241,12 +3293,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 : IsF64 ? AMDGPU::V_FMA_F64_e64 : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); @@ -3605,12 +3658,6 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const { const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc, and src0_modifiers and src1_modifiers are not set. - // We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. 
if (Src2) { switch (MI.getOpcode()) { default: return false; @@ -4563,8 +4610,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, unsigned RCID, bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (TID.mayLoad() || TID.mayStore() || - (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + (((TID.mayLoad() || TID.mayStore()) && + !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; @@ -5001,8 +5049,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, --ConstantBusLimit; } - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; + for (int Idx : VOP3Idx) { if (Idx == -1) break; MachineOperand &MO = MI.getOperand(Idx); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 47ee83eb9351..dda92d3d25ff 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1350,11 +1350,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; // Complex patterns //===----------------------------------------------------------------------===// -def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; -def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; -def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">; +def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">; +def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned">; -def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; +def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; 
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d55d8da8699a..636337ede000 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), @@ -2106,6 +2117,19 @@ def : GCNPat < } // end isWave32 def : GCNPat < + (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), + (V_NOT_B32_e32 $src0) +>; + +def : GCNPat < + (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), + (REG_SEQUENCE VReg_64, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 + ) +>; + +def : GCNPat < (f16 (sint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2188,18 +2212,18 @@ def : GCNPat < >; def : GCNPat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i32:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i16:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 
>; def : GCNPat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), + (i1 (DivergentUnaryFrag<trunc> i64:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; @@ -2405,21 +2429,37 @@ def : GCNPat < // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. def : GCNPat < - (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))), + (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; @@ -2435,42 +2475,74 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; def : GCNPat < - (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i32 16), 
SReg_32:$src1)) +>; + + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; +def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), + (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) +>; + def : GCNPat < - (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) >; -// TODO: Should source modifiers be matched to v_pack_b32_f16? 
def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) +>; + + +def : GCNPat < (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) @@ -2866,6 +2938,18 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c4007f56f350..3ce368ef4db9 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -62,11 +62,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // calls. 
const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't - // have any calls. - const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && - CC != CallingConv::AMDGPU_Gfx && - (!isEntryFunction() || HasCalls); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; @@ -80,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } if (!isEntryFunction()) { - if (UseFixedABI) + if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; // TODO: Pick a high register, and shift down, similar to a kernel. @@ -110,20 +105,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - ImplicitArgPtr = true; - WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - - // FIXME: We don't need this? 
- DispatchID = true; - } else if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -462,7 +444,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { - if (R.second.FullyAllocated) + if (R.second.IsDead) MFI.RemoveStackObject(R.first); } } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index c305bc20e40d..8accbf611c5f 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -465,6 +465,7 @@ public: struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; + bool IsDead = false; }; // Map WWM VGPR to a stack slot that is used to save/restore it in the @@ -546,6 +547,12 @@ public: : I->second.Lanes[Lane]; } + void setVGPRToAGPRSpillDead(int FrameIndex) { + auto I = VGPRToAGPRSpills.find(FrameIndex); + if (I != VGPRToAGPRSpills.end()) + I->second.IsDead = true; + } + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 5590d84cc3ab..81db66a98ddf 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -869,29 +869,27 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { } void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; // Every combination of colors given by the top down // and bottom up Reserved node dependency - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; + for (const SUnit &SU : 
DAG->SUnits) { std::pair<unsigned, unsigned> SUColors; // High latency instructions: already given. - if (CurrentColoring[SU->NodeNum]) + if (CurrentColoring[SU.NodeNum]) continue; - SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; - SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum]; std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = ColorCombinations.find(SUColors); if (Pos != ColorCombinations.end()) { - CurrentColoring[SU->NodeNum] = Pos->second; + CurrentColoring[SU.NodeNum] = Pos->second; } else { - CurrentColoring[SU->NodeNum] = NextNonReservedID; + CurrentColoring[SU.NodeNum] = NextNonReservedID; ColorCombinations[SUColors] = NextNonReservedID++; } } @@ -1232,15 +1230,13 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria } // Free root and leafs of all blocks to enable scheduling inside them. 
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock *Block : CurrentBlocks) Block->finalizeUnits(); - } - LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); + LLVM_DEBUG({ + dbgs() << "Blocks created:\n\n"; + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1379,9 +1375,9 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); + LLVM_DEBUG({ + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); }); } @@ -1437,8 +1433,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // found for several parents, we increment the usage of the one with the // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; @@ -1502,8 +1497,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. 
- for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { @@ -1700,10 +1694,7 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); addLiveRegs(Block->getOutRegs()); releaseBlockSuccs(Block); - for (std::map<unsigned, unsigned>::iterator RegI = - LiveOutRegsNumUsages[Block->getID()].begin(), - E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { - std::pair<unsigned, unsigned> RegP = *RegI; + for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0); @@ -1759,8 +1750,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, ScheduledBlocks = Scheduler.getBlocks(); - for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { - SIScheduleBlock *Block = ScheduledBlocks[b]; + for (SIScheduleBlock *Block : ScheduledBlocks) { std::vector<SUnit*> SUs = Block->getScheduledUnits(); for (SUnit* SU : SUs) @@ -2000,9 +1990,8 @@ void SIScheduleDAGMI::schedule() assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); TopRPTracker.setPos(CurrentTop); - for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), - E = ScheduledSUnits.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; + for (unsigned I : ScheduledSUnits) { + SUnit *SU = &SUnits[I]; scheduleMI(SU, true); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1d9a23a5084..21aed4ececb5 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -210,6 +210,7 @@ struct SGPRSpillBuilder { auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), 
ExecReg).addReg(ExecReg); if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitDefine); + I->getOperand(2).setIsDead(true); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } } @@ -242,9 +243,10 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, /*IsKill*/ false); auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); - if (!TmpVGPRLive) { + if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitKill); - } + I->getOperand(2).setIsDead(true); // Mark SCC as dead. + // Restore active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); @@ -267,9 +269,11 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); // Spill inactive lanes - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not0->getOperand(2).setIsDead(); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
} } @@ -908,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V1024_RESTORE: case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: @@ -915,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -922,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV256_RESTORE: return 8; case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S224_RESTORE: @@ -929,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V224_RESTORE: case AMDGPU::SI_SPILL_A224_SAVE: case AMDGPU::SI_SPILL_A224_RESTORE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV224_RESTORE: return 7; case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S192_RESTORE: @@ -936,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V192_RESTORE: case AMDGPU::SI_SPILL_A192_SAVE: case AMDGPU::SI_SPILL_A192_RESTORE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV192_RESTORE: return 6; case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -943,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A160_RESTORE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV160_RESTORE: return 5; case AMDGPU::SI_SPILL_S128_SAVE: case 
AMDGPU::SI_SPILL_S128_RESTORE: @@ -950,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -957,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A96_RESTORE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV96_RESTORE: return 3; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: @@ -964,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_A64_SAVE: case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: @@ -971,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_A32_SAVE: case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_AV32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1240,9 +1264,10 @@ void SIRegisterInfo::buildSpillLoadStore( if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { - BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) + auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); + Add->getOperand(3).setIsDead(); // Mark SCC as dead. 
} Offset = 0; @@ -1810,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A64_SAVE: - case AMDGPU::SI_SPILL_A32_SAVE: { + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -1846,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A224_RESTORE: case AMDGPU::SI_SPILL_A256_RESTORE: case AMDGPU::SI_SPILL_A512_RESTORE: - case AMDGPU::SI_SPILL_A1024_RESTORE: { + case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_AV64_RESTORE: + case AMDGPU::SI_SPILL_AV96_RESTORE: + case AMDGPU::SI_SPILL_AV128_RESTORE: + case AMDGPU::SI_SPILL_AV160_RESTORE: + case AMDGPU::SI_SPILL_AV192_RESTORE: + case AMDGPU::SI_SPILL_AV224_RESTORE: + case AMDGPU::SI_SPILL_AV256_RESTORE: + case AMDGPU::SI_SPILL_AV512_RESTORE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 3a372d4519fb..c8f1daf26de9 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -731,11 +731,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } - // getVOPe32 could be 
-1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { @@ -776,10 +771,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - // Check the carry-in operand for v_addc_u32_e64. - const MachineOperand *Src2 = TII->getNamedOperand(MI, - AMDGPU::OpName::src2); - if (SDst) { bool Next = false; @@ -791,6 +782,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. + const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46012e5d7d97..77ee3c0ff0e4 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -495,11 +495,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // instruction as needing e.g. WQM before visiting it and realizing it needs // WQM disabled. 
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); - for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = **BI; - BlockInfo &BBI = Blocks[&MBB]; + for (MachineBasicBlock *MBB : RPOT) { + BlockInfo &BBI = Blocks[MBB]; - for (MachineInstr &MI : MBB) { + for (MachineInstr &MI : *MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -561,7 +560,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(&MBB); + Worklist.push_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 8502ed61b366..184c871db775 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -181,15 +181,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - // FIXME: This should be definitively mayStore = 0. TableGen - // brokenly tries to infer these based on the intrinsic properties - // corresponding to the IR attributes. The target intrinsics are - // considered as writing to memory for IR dependency purposes, but - // those can be modeled with hasSideEffects here. These also end up - // inferring differently for llvm.readcyclecounter and the amdgcn - // intrinsics. 
- let mayStore = ?; - let mayLoad = 1; + let mayStore = 0; + let mayLoad = 0; let has_sbase = 0; let has_offset = 0; } @@ -765,11 +758,11 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; multiclass SMRD_Pattern <string Instr, ValueType vt> { diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 61ecc13620a1..1713586dcf5b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : let has_sdst = 0; } +class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class UniformBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. 
+ // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class DivergentBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -172,11 +208,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", - [(set i32:$sdst, (not i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] >; def S_NOT_B64 : SOP1_64 <"s_not_b64", - [(set i64:$sdst, (not i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; @@ -221,22 +257,22 @@ let isReMaterializable = 1 in { def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", - [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))] >; def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))] >; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", - [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))] >; def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", - [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))] >; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", - [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] + [(set 
i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))] >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", @@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; -class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0), - (Op $src0), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class UniformBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class DivergentBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. 
- let GISelPredicateCode = [{return true;}]; -} let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { @@ -485,19 +486,18 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End Uses = [SCC] - let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -870,7 +870,7 @@ def S_GETREG_B32 : SOPK_Pseudo < } } // End mayLoad = 1 -let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in { +let Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. 
class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo < @@ -914,7 +914,7 @@ def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo { let hasSideEffects = 0; } -} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] +} // End Defs = [MODE], Uses = [MODE] class SOPK_WAITCNT<string opName, list<dag> pat=[]> : SOPK_Pseudo< @@ -1264,7 +1264,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayStore = 1; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +let hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", @@ -1278,8 +1278,6 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; @@ -1305,14 +1303,10 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> { def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { let simm16 = 0; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp index 2e4d83fbbc39..a83ff6667956 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -15,7 +15,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" -#include 
"llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -25,175 +24,6 @@ namespace llvm { namespace AMDGPU { -// An helper class for collecting all reachable callees for each kernel defined -// within the module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { - auto *CGN = GI->second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { - SmallPtrSet<Function *, 8> ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. - // - SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); - SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. 
- if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { - auto *RCB = cast<CallBase>(GI->first.getValue()); - auto *RCGN = GI->second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet<Function *, 8> LDSAccessors; - SmallVector<User *, 8> UserStack(GV->users()); - SmallPtrSet<User *, 8> VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa<GlobalValue>(U)) - return SmallPtrSet<Function *, 8>(); - - // Recursively explore constant users. 
- if (isa<Constant>(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. - Function *F = cast<Instruction>(U)->getFunction(); - if (!AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; - SmallVector<User *, 8> UserStack; - SmallPtrSet<User *, 8> VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa<GlobalValue>(UU)) - continue; - - if (isa<Constant>(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast<Instruction>(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } @@ -232,26 +62,8 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { } } -bool hasUserInstruction(const GlobalValue *GV) { - SmallPtrSet<const User *, 8> Visited; - SmallVector<const User *, 16> Stack(GV->users()); - - while (!Stack.empty()) { - const User *U = Stack.pop_back_val(); - - if (!Visited.insert(U).second) - continue; - - if (isa<Instruction>(U)) - return true; - - append_range(Stack, U->users()); - } - - return false; -} - -bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { +static bool shouldLowerLDSToStruct(const GlobalVariable &GV, + const Function *F) { // We are not interested in kernel LDS lowering 
for module LDS itself. if (F && GV.getName() == "llvm.amdgcn.module.lds") return false; @@ -259,7 +71,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { bool Ret = false; SmallPtrSet<const User *, 8> Visited; SmallVector<const User *, 16> Stack(GV.users()); - SmallPtrSet<const GlobalValue *, 8> GlobalUsers; assert(!F || isKernelCC(F)); @@ -267,15 +78,10 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { const User *V = Stack.pop_back_val(); Visited.insert(V); - if (auto *G = dyn_cast<GlobalValue>(V)) { - StringRef GName = G->getName(); - if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { - // For kernel LDS lowering, if G is not a compiler.used list, then we - // cannot lower the lds GV since we cannot replace the use of GV within - // G. - return false; - } - GlobalUsers.insert(G); + if (isa<GlobalValue>(V)) { + // This use of the LDS variable is the initializer of a global variable. + // This is ill formed. The address of an LDS variable is kernel dependent + // and unknown until runtime. It can't be written to a global variable. continue; } @@ -297,15 +103,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { append_range(Stack, V->users()); } - if (!F && !Ret) { - // For module LDS lowering, we have not yet decided if we should lower GV or - // not. Explore all global users of GV, and check if atleast one of these - // global users appear as an use within an instruction (possibly nested use - // via constant expression), if so, then conservately lower LDS. - for (auto *G : GlobalUsers) - Ret |= hasUserInstruction(G); - } - return Ret; } @@ -324,7 +121,7 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, continue; } if (!isa<UndefValue>(GV.getInitializer())) { - // Initializers are unimplemented for local address space. + // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. 
continue; } @@ -342,20 +139,6 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, return LocalVars; } -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { - SmallPtrSet<GlobalValue *, 32> UsedList; - - SmallVector<GlobalValue *, 32> TmpVec; - collectUsedGlobalVariables(M, TmpVec, true); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - TmpVec.clear(); - collectUsedGlobalVariables(M, TmpVec, false); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - return UsedList; -} - } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h index d1c9229bc336..83ef68cc3f60 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -22,44 +22,13 @@ class ConstantExpr; namespace AMDGPU { -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees); - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. 
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); -/// \returns true if a given global variable \p GV (or its global users) appear -/// as an use within some instruction (either from kernel or from non-kernel). -bool hasUserInstruction(const GlobalValue *GV); - -/// \returns true if an LDS global requires lowering to a module LDS structure -/// if \p F is not given. If \p F is given it must be a kernel and function -/// \returns true if an LDS global is directly used from that kernel and it -/// is safe to replace its uses with a kernel LDS structure member. -bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F = nullptr); - std::vector<GlobalVariable *> findVariablesToLower(Module &M, const Function *F = nullptr); -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M); - /// Replace all uses of constant \p C with instructions in \p F. void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); } // end namespace AMDGPU diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index a3eccf13cd71..a8368892c565 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> { list<dag> ret = [!con(Outs, (set Ins))]; } +class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. 
+ let GISelPredicateCode = [{return true;}]; +} + class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []); } diff --git a/llvm/lib/Target/ARM/A15SDOptimizer.cpp b/llvm/lib/Target/ARM/A15SDOptimizer.cpp index f4d0f4a6d6b0..d0efecad63bc 100644 --- a/llvm/lib/Target/ARM/A15SDOptimizer.cpp +++ b/llvm/lib/Target/ARM/A15SDOptimizer.cpp @@ -592,16 +592,15 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { SmallVector<unsigned, 8> Defs = getReadDPRs(MI); bool Modified = false; - for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end(); - I != E; ++I) { + for (unsigned I : Defs) { // Follow the def-use chain for this DPR through COPYs, and also through // PHIs (which are essentially multi-way COPYs). It is because of PHIs that // we can end up with multiple defs of this DPR. SmallVector<MachineInstr *, 8> DefSrcs; - if (!Register::isVirtualRegister(*I)) + if (!Register::isVirtualRegister(I)) continue; - MachineInstr *Def = MRI->getVRegDef(*I); + MachineInstr *Def = MRI->getVRegDef(I); if (!Def) continue; @@ -628,18 +627,17 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) { if (NewReg != 0) { Modified = true; - for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(), - E = Uses.end(); I != E; ++I) { + for (MachineOperand *Use : Uses) { // Make sure to constrain the register class of the new register to // match what we're replacing. Otherwise we can optimize a DPR_VFP2 // reference into a plain DPR, and that will end poorly. NewReg is // always virtual here, so there will always be a matching subclass // to find. 
- MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg())); + MRI->constrainRegClass(NewReg, MRI->getRegClass(Use->getReg())); - LLVM_DEBUG(dbgs() << "Replacing operand " << **I << " with " + LLVM_DEBUG(dbgs() << "Replacing operand " << *Use << " with " << printReg(NewReg) << "\n"); - (*I)->substVirtReg(NewReg, 0, *TRI); + Use->substVirtReg(NewReg, 0, *TRI); } } Replacements[MI] = NewReg; diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index e03dd597eb65..8173fe4036a8 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -446,6 +446,11 @@ def FeaturePACBTI : SubtargetFeature<"pacbti", "HasPACBTI", "true", "Enable Pointer Authentication and Branch " "Target Identification">; +def FeatureNoBTIAtReturnTwice : SubtargetFeature<"no-bti-at-return-twice", + "NoBTIAtReturnTwice", "true", + "Don't place a BTI instruction " + "after a return-twice">; + //===----------------------------------------------------------------------===// // ARM architecture class // diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 6a88ac485e69..fa09b2567aa9 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1153,8 +1153,12 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { unsigned StartOp = 2 + 2; // Use all the operands. unsigned NumOffset = 0; - // Amount of SP adjustment folded into a push. - unsigned Pad = 0; + // Amount of SP adjustment folded into a push, before the + // registers are stored (pad at higher addresses). + unsigned PadBefore = 0; + // Amount of SP adjustment folded into a push, after the + // registers are stored (pad at lower addresses). 
+ unsigned PadAfter = 0; switch (Opc) { default: @@ -1185,7 +1189,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { "Pad registers must come before restored ones"); unsigned Width = TargetRegInfo->getRegSizeInBits(MO.getReg(), MachineRegInfo) / 8; - Pad += Width; + PadAfter += Width; continue; } // Check for registers that are remapped (for a Thumb1 prologue that @@ -1201,14 +1205,32 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { case ARM::t2STR_PRE: assert(MI->getOperand(2).getReg() == ARM::SP && "Only stack pointer as a source reg is supported"); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; + + RegList.push_back(SrcReg); + break; + case ARM::t2STRD_PRE: + assert(MI->getOperand(3).getReg() == ARM::SP && + "Only stack pointer as a source reg is supported"); + SrcReg = MI->getOperand(1).getReg(); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; + RegList.push_back(SrcReg); + SrcReg = MI->getOperand(2).getReg(); + if (unsigned RemappedReg = AFI->EHPrologueRemappedRegs.lookup(SrcReg)) + SrcReg = RemappedReg; RegList.push_back(SrcReg); + PadBefore = -MI->getOperand(4).getImm() - 8; break; } if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) { + if (PadBefore) + ATS.emitPad(PadBefore); ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD); // Account for the SP adjustment, folded into the push. - if (Pad) - ATS.emitPad(Pad); + if (PadAfter) + ATS.emitPad(PadAfter); } } else { // Changes of stack / frame pointer. 
@@ -1300,6 +1322,10 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Offset = MI->getOperand(2).getImm(); AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16); break; + case ARM::t2PAC: + case ARM::t2PACBTI: + AFI->EHPrologueRemappedRegs[ARM::R12] = ARM::RA_AUTH_CODE; + break; default: MI->print(errs()); llvm_unreachable("Unsupported opcode for unwinding information"); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 2a12947d24a8..884f38ff6c58 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2629,8 +2629,8 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // Add the complete list back in. MachineInstrBuilder MIB(MF, &*MI); - for (int i = RegList.size() - 1; i >= 0; --i) - MIB.add(RegList[i]); + for (const MachineOperand &MO : llvm::reverse(RegList)) + MIB.add(MO); return true; } @@ -5678,7 +5678,7 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, /// | | Thumb2 | ARM | /// +-------------------------+--------+-----+ /// | Call overhead in Bytes | 4 | 4 | -/// | Frame overhead in Bytes | 4 | 4 | +/// | Frame overhead in Bytes | 2 | 4 | /// | Stack fixup required | No | No | /// +-------------------------+--------+-----+ /// @@ -5755,7 +5755,7 @@ struct OutlinerCosts { CallThunk(target.isThumb() ? 4 : 4), FrameThunk(target.isThumb() ? 0 : 0), CallNoLRSave(target.isThumb() ? 4 : 4), - FrameNoLRSave(target.isThumb() ? 4 : 4), + FrameNoLRSave(target.isThumb() ? 2 : 4), CallRegSave(target.isThumb() ? 8 : 12), FrameRegSave(target.isThumb() ? 2 : 4), CallDefault(target.isThumb() ? 
8 : 12), @@ -5868,11 +5868,17 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( return outliner::OutlinedFunction(); } + // We expect the majority of the outlining candidates to be in consensus with + // regard to return address sign and authentication, and branch target + // enforcement, in other words, partitioning according to all the four + // possible combinations of PAC-RET and BTI is going to yield one big subset + // and three small (likely empty) subsets. That allows us to cull incompatible + // candidates separately for PAC-RET and BTI. + // Partition the candidates in two sets: one with BTI enabled and one with BTI - // disabled. Remove the candidates from the smaller set. We expect the - // majority of the candidates to be in consensus with regard to branch target - // enforcement with just a few oddballs, but if they are the same number - // prefer the non-BTI ones for outlining, since they have less overhead. + // disabled. Remove the candidates from the smaller set. If they are the same + // number prefer the non-BTI ones for outlining, since they have less + // overhead. auto NoBTI = llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); @@ -5883,6 +5889,24 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( RepeatedSequenceLocs.erase(NoBTI, RepeatedSequenceLocs.end()); else RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoBTI); + + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + + // Likewise, partition the candidates according to PAC-RET enablement. + auto NoPAC = + llvm::partition(RepeatedSequenceLocs, [](const outliner::Candidate &C) { + const ARMFunctionInfo &AFI = *C.getMF()->getInfo<ARMFunctionInfo>(); + // If the function happens to not spill the LR, do not disqualify it + // from the outlining. 
+ return AFI.shouldSignReturnAddress(true); + }); + if (std::distance(RepeatedSequenceLocs.begin(), NoPAC) > + std::distance(NoPAC, RepeatedSequenceLocs.end())) + RepeatedSequenceLocs.erase(NoPAC, RepeatedSequenceLocs.end()); + else + RepeatedSequenceLocs.erase(RepeatedSequenceLocs.begin(), NoPAC); + if (RepeatedSequenceLocs.size() < 2) return outliner::OutlinedFunction(); @@ -5899,6 +5923,7 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( }; OutlinerCosts Costs(Subtarget); + const auto &SomeMFI = *RepeatedSequenceLocs.front().getMF()->getInfo<ARMFunctionInfo>(); // Adjust costs to account for the BTI instructions. @@ -5909,6 +5934,13 @@ outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo( Costs.FrameTailCall += 4; Costs.FrameThunk += 4; } + + // Adjust costs to account for sign and authentication instructions. + if (SomeMFI.shouldSignReturnAddress(true)) { + Costs.CallDefault += 8; // +PAC instr, +AUT instr + Costs.SaveRestoreLROnStack += 8; // +PAC instr, +AUT instr + } + unsigned FrameID = MachineOutlinerDefault; unsigned NumBytesToCreateFrame = Costs.FrameDefault; @@ -6325,6 +6357,11 @@ ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, // * LR is available in the range (No save/restore around call) // * The range doesn't include calls (No save/restore in outlined frame) // are true. + // These conditions also ensure correctness of the return address + // authentication - we insert sign and authentication instructions only if + // we save/restore LR on stack, but then this condition ensures that the + // outlined range does not modify the SP, therefore the SP value used for + // signing is the same as the one used for authentication. // FIXME: This is very restrictive; the flags check the whole block, // not just the bit we will try to outline. 
bool MightNeedStackFixUp = @@ -6369,23 +6406,39 @@ void ARMBaseInstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { } void ARMBaseInstrInfo::saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; - int Align = -Subtarget.getStackAlignment().value(); - BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) - .addReg(ARM::LR, RegState::Kill) - .addReg(ARM::SP) - .addImm(Align) - .add(predOps(ARMCC::AL)); -} + MachineBasicBlock::iterator It, bool CFI, + bool Auth) const { + int Align = std::max(Subtarget.getStackAlignment().value(), uint64_t(8)); + assert(Align >= 8 && Align <= 256); + if (Auth) { + assert(Subtarget.isThumb2()); + // Compute PAC in R12. Outlining ensures R12 is dead across the outlined + // sequence. + BuildMI(MBB, It, DebugLoc(), get(ARM::t2PAC)) + .setMIFlags(MachineInstr::FrameSetup); + BuildMI(MBB, It, DebugLoc(), get(ARM::t2STRD_PRE), ARM::SP) + .addReg(ARM::R12, RegState::Kill) + .addReg(ARM::LR, RegState::Kill) + .addReg(ARM::SP) + .addImm(-Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); + } else { + unsigned Opc = Subtarget.isThumb() ? ARM::t2STR_PRE : ARM::STR_PRE_IMM; + BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::SP) + .addReg(ARM::LR, RegState::Kill) + .addReg(ARM::SP) + .addImm(-Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameSetup); + } + + if (!CFI) + return; -void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int Align = Subtarget.getStackAlignment().value(); - // Add a CFI saying the stack was moved down. + + // Add a CFI, saying CFA is offset by Align bytes from SP. 
int64_t StackPosEntry = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Align)); BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) @@ -6394,11 +6447,23 @@ void ARMBaseInstrInfo::emitCFIForLRSaveOnStack( // Add a CFI saying that the LR that we want to find is now higher than // before. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfLR, -Align)); + int LROffset = Auth ? Align - 4 : Align; + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int64_t LRPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfLR, -LROffset)); BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) .addCFIIndex(LRPosEntry) .setMIFlags(MachineInstr::FrameSetup); + if (Auth) { + // Add a CFI for the location of the return adddress PAC. + unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); + int64_t RACPosEntry = MF.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfRAC, -Align)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(RACPosEntry) + .setMIFlags(MachineInstr::FrameSetup); + } } void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, @@ -6416,35 +6481,64 @@ void ARMBaseInstrInfo::emitCFIForLRSaveToReg(MachineBasicBlock &MBB, .setMIFlags(MachineInstr::FrameSetup); } -void ARMBaseInstrInfo::restoreLRFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - unsigned Opc = Subtarget.isThumb() ? 
ARM::t2LDR_POST : ARM::LDR_POST_IMM; - MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) - .addReg(ARM::SP, RegState::Define) - .addReg(ARM::SP); - if (!Subtarget.isThumb()) - MIB.addReg(0); - MIB.addImm(Subtarget.getStackAlignment().value()).add(predOps(ARMCC::AL)); -} +void ARMBaseInstrInfo::restoreLRFromStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator It, + bool CFI, bool Auth) const { + int Align = Subtarget.getStackAlignment().value(); + if (Auth) { + assert(Subtarget.isThumb2()); + // Restore return address PAC and LR. + BuildMI(MBB, It, DebugLoc(), get(ARM::t2LDRD_POST)) + .addReg(ARM::R12, RegState::Define) + .addReg(ARM::LR, RegState::Define) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP) + .addImm(Align) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameDestroy); + // LR authentication is after the CFI instructions, below. + } else { + unsigned Opc = Subtarget.isThumb() ? ARM::t2LDR_POST : ARM::LDR_POST_IMM; + MachineInstrBuilder MIB = BuildMI(MBB, It, DebugLoc(), get(Opc), ARM::LR) + .addReg(ARM::SP, RegState::Define) + .addReg(ARM::SP); + if (!Subtarget.isThumb()) + MIB.addReg(0); + MIB.addImm(Subtarget.getStackAlignment().value()) + .add(predOps(ARMCC::AL)) + .setMIFlags(MachineInstr::FrameDestroy); + } -void ARMBaseInstrInfo::emitCFIForLRRestoreFromStack( - MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - // Now stack has moved back up... - MachineFunction &MF = *MBB.getParent(); - const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); - unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); - int64_t StackPosEntry = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(StackPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); + if (CFI) { + // Now stack has moved back up... 
+ MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo *MRI = Subtarget.getRegisterInfo(); + unsigned DwarfLR = MRI->getDwarfRegNum(ARM::LR, true); + int64_t StackPosEntry = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(StackPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); - // ... and we have restored LR. - int64_t LRPosEntry = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); - BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) - .addCFIIndex(LRPosEntry) - .setMIFlags(MachineInstr::FrameDestroy); + // ... and we have restored LR. + int64_t LRPosEntry = + MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, DwarfLR)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(LRPosEntry) + .setMIFlags(MachineInstr::FrameDestroy); + + if (Auth) { + unsigned DwarfRAC = MRI->getDwarfRegNum(ARM::RA_AUTH_CODE, true); + int64_t Entry = + MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, DwarfRAC)); + BuildMI(MBB, It, DebugLoc(), get(ARM::CFI_INSTRUCTION)) + .addCFIIndex(Entry) + .setMIFlags(MachineInstr::FrameDestroy); + } + } + + if (Auth) + BuildMI(MBB, It, DebugLoc(), get(ARM::t2AUT)); } void ARMBaseInstrInfo::emitCFIForLRRestoreFromReg( @@ -6500,8 +6594,11 @@ void ARMBaseInstrInfo::buildOutlinedFrame( MBB.addLiveIn(ARM::LR); // Insert a save before the outlined region - saveLROnStack(MBB, It); - emitCFIForLRSaveOnStack(MBB, It); + bool Auth = OF.Candidates.front() + .getMF() + ->getInfo<ARMFunctionInfo>() + ->shouldSignReturnAddress(true); + saveLROnStack(MBB, It, true, Auth); // Fix up the instructions in the range, since we're going to modify the // stack. @@ -6510,8 +6607,7 @@ void ARMBaseInstrInfo::buildOutlinedFrame( fixupPostOutline(MBB); // Insert a restore before the terminator for the function. Restore LR. 
- restoreLRFromStack(MBB, Et); - emitCFIForLRRestoreFromStack(MBB, Et); + restoreLRFromStack(MBB, Et, true, Auth); } // If this is a tail call outlined function, then there's already a return. @@ -6590,13 +6686,10 @@ MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall( // We have the default case. Save and restore from SP. if (!MBB.isLiveIn(ARM::LR)) MBB.addLiveIn(ARM::LR); - saveLROnStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRSaveOnStack(MBB, It); + bool Auth = !AFI.isLRSpilled() && AFI.shouldSignReturnAddress(true); + saveLROnStack(MBB, It, !AFI.isLRSpilled(), Auth); CallPt = MBB.insert(It, CallMIB); - restoreLRFromStack(MBB, It); - if (!AFI.isLRSpilled()) - emitCFIForLRRestoreFromStack(MBB, It); + restoreLRFromStack(MBB, It, !AFI.isLRSpilled(), Auth); It--; return CallPt; } diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 5fa912ae35d7..defce07dd862 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -377,20 +377,20 @@ private: /// constructing an outlined call if one exists. Returns 0 otherwise. unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const; - // Adds an instruction which saves the link register on top of the stack into - /// the MachineBasicBlock \p MBB at position \p It. - void saveLROnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; + /// Adds an instruction which saves the link register on top of the stack into + /// the MachineBasicBlock \p MBB at position \p It. If \p Auth is true, + /// compute and store an authentication code alongiside the link register. + /// If \p CFI is true, emit CFI instructions. + void saveLROnStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator It, + bool CFI, bool Auth) const; /// Adds an instruction which restores the link register from the top the - /// stack into the MachineBasicBlock \p MBB at position \p It. 
+ /// stack into the MachineBasicBlock \p MBB at position \p It. If \p Auth is + /// true, restore an authentication code and authenticate LR. + /// If \p CFI is true, emit CFI instructions. void restoreLRFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// for the case when the LR is saved on the stack. - void emitCFIForLRSaveOnStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; + MachineBasicBlock::iterator It, bool CFI, + bool Auth) const; /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, /// for the case when the LR is saved in the register \p Reg. @@ -399,11 +399,6 @@ private: Register Reg) const; /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, - /// after the LR is was restored from the stack. - void emitCFIForLRRestoreFromStack(MachineBasicBlock &MBB, - MachineBasicBlock::iterator It) const; - - /// Emit CFI instructions into the MachineBasicBlock \p MBB at position \p It, /// after the LR is was restored from a register. 
void emitCFIForLRRestoreFromReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const; diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index b53efe58e8de..c543d02ff75a 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -530,6 +530,8 @@ getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { unsigned ImmIdx = 0; switch (AddrMode) { case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8neg: + case ARMII::AddrModeT2_i8pos: case ARMII::AddrModeT2_i12: case ARMII::AddrMode_i12: InstrOffs = MI->getOperand(Idx+1).getImm(); @@ -728,6 +730,8 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, bool isSigned = true; switch (AddrMode) { case ARMII::AddrModeT2_i8: + case ARMII::AddrModeT2_i8pos: + case ARMII::AddrModeT2_i8neg: case ARMII::AddrModeT2_i12: // i8 supports only negative, and i12 supports only positive, so // based on Offset sign, consider the appropriate instruction diff --git a/llvm/lib/Target/ARM/ARMBranchTargets.cpp b/llvm/lib/Target/ARM/ARMBranchTargets.cpp index 1091c1f970fa..8ba3e627c039 100644 --- a/llvm/lib/Target/ARM/ARMBranchTargets.cpp +++ b/llvm/lib/Target/ARM/ARMBranchTargets.cpp @@ -108,6 +108,7 @@ void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, bool IsFirstBB) { // Which instruction to insert: BTI or PACBTI unsigned OpCode = ARM::t2BTI; + unsigned MIFlags = 0; // Skip meta instructions, including EH labels auto MBBI = llvm::find_if_not(MBB.instrs(), [](const MachineInstr &MI) { @@ -121,6 +122,7 @@ void ARMBranchTargets::addBTI(const ARMInstrInfo &TII, MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "Removing a 'PAC' instr from BB '" << MBB.getName() << "' to replace with PACBTI\n"); OpCode = ARM::t2PACBTI; + MIFlags = MachineInstr::FrameSetup; auto NextMBBI = std::next(MBBI); MBBI->eraseFromParent(); MBBI = NextMBBI; @@ -131,5 +133,6 @@ void ARMBranchTargets::addBTI(const 
ARMInstrInfo &TII, MachineBasicBlock &MBB, << (OpCode == ARM::t2BTI ? "BTI" : "PACBTI") << "' instr into BB '" << MBB.getName() << "'\n"); // Finally, insert a new instruction (either PAC or PACBTI) - BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)); + BuildMI(MBB, MBBI, MBB.findDebugLoc(MBBI), TII.get(OpCode)) + .setMIFlags(MIFlags); } diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp index d8d9ca3b912f..32f3a4a632f5 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.cpp +++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp @@ -230,10 +230,9 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT, unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); if (RegResult) { - for (SmallVectorImpl<CCValAssign>::iterator It = PendingMembers.begin(); - It != PendingMembers.end(); ++It) { - It->convertToReg(RegResult); - State.addLoc(*It); + for (CCValAssign &PendingMember : PendingMembers) { + PendingMember.convertToReg(RegResult); + State.addLoc(PendingMember); ++RegResult; } PendingMembers.clear(); diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index c2ca4708c208..a2a4f1f3bdfd 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -310,8 +310,7 @@ void ARMConstantIslands::verify() { BBInfo[RHS.getNumber()].postOffset(); })); LLVM_DEBUG(dbgs() << "Verifying " << CPUsers.size() << " CP users.\n"); - for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { - CPUser &U = CPUsers[i]; + for (CPUser &U : CPUsers) { unsigned UserOffset = getUserOffset(U); // Verify offset using the real max displacement without the safety // adjustment. @@ -697,10 +696,9 @@ ARMConstantIslands::findConstPoolEntry(unsigned CPI, std::vector<CPEntry> &CPEs = CPEntries[CPI]; // Number of entries per constpool index should be small, just do a // linear search. 
- for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { - if (CPEs[i].CPEMI == CPEMI) - return &CPEs[i]; - } + for (CPEntry &CPE : CPEs) + if (CPE.CPEMI == CPEMI) + return &CPE; return nullptr; } @@ -1234,27 +1232,27 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset) { // No. Look for previously created clones of the CPE that are in range. unsigned CPI = getCombinedIndex(CPEMI); std::vector<CPEntry> &CPEs = CPEntries[CPI]; - for (unsigned i = 0, e = CPEs.size(); i != e; ++i) { + for (CPEntry &CPE : CPEs) { // We already tried this one - if (CPEs[i].CPEMI == CPEMI) + if (CPE.CPEMI == CPEMI) continue; // Removing CPEs can leave empty entries, skip - if (CPEs[i].CPEMI == nullptr) + if (CPE.CPEMI == nullptr) continue; - if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(), - U.NegOk)) { - LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" - << CPEs[i].CPI << "\n"); + if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getMaxDisp(), + U.NegOk)) { + LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI + << "\n"); // Point the CPUser node to the replacement - U.CPEMI = CPEs[i].CPEMI; + U.CPEMI = CPE.CPEMI; // Change the CPI in the instruction operand to refer to the clone. for (MachineOperand &MO : UserMI->operands()) if (MO.isCPI()) { - MO.setIndex(CPEs[i].CPI); + MO.setIndex(CPE.CPI); break; } // Adjust the refcount of the clone... - CPEs[i].RefCount++; + CPE.RefCount++; // ...and the original. If we didn't remove the old entry, none of the // addresses changed, so we don't need another pass. return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1; @@ -1675,15 +1673,14 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) { /// are zero. 
bool ARMConstantIslands::removeUnusedCPEntries() { unsigned MadeChange = false; - for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) { - std::vector<CPEntry> &CPEs = CPEntries[i]; - for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) { - if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) { - removeDeadCPEMI(CPEs[j].CPEMI); - CPEs[j].CPEMI = nullptr; - MadeChange = true; - } + for (std::vector<CPEntry> &CPEs : CPEntries) { + for (CPEntry &CPE : CPEs) { + if (CPE.RefCount == 0 && CPE.CPEMI) { + removeDeadCPEMI(CPE.CPEMI); + CPE.CPEMI = nullptr; + MadeChange = true; } + } } return MadeChange; } @@ -1829,8 +1826,7 @@ bool ARMConstantIslands::optimizeThumb2Instructions() { bool MadeChange = false; // Shrink ADR and LDR from constantpool. - for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) { - CPUser &U = CPUsers[i]; + for (CPUser &U : CPUsers) { unsigned Opcode = U.MI->getOpcode(); unsigned NewOpc = 0; unsigned Scale = 1; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 7a35f252b22a..fa244786a80d 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2160,6 +2160,11 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } case ARM::tBXNS_RET: { + // For v8.0-M.Main we need to authenticate LR before clearing FPRs, which + // uses R12 as a scratch register. + if (!STI->hasV8_1MMainlineOps() && AFI->shouldSignReturnAddress()) + BuildMI(MBB, MBBI, DebugLoc(), TII->get(ARM::t2AUT)); + MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI); if (STI->hasV8_1MMainlineOps()) { @@ -2169,6 +2174,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, .addReg(ARM::SP) .addImm(4) .add(predOps(ARMCC::AL)); + + if (AFI->shouldSignReturnAddress()) + BuildMI(AfterBB, AfterBB.end(), DebugLoc(), TII->get(ARM::t2AUT)); } // Clear all GPR that are not a use of the return instruction. 
@@ -3073,6 +3081,22 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case ARM::t2CALL_BTI: { + MachineFunction &MF = *MI.getMF(); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), TII->get(ARM::tBL)); + MIB.cloneMemRefs(MI); + for (unsigned i = 0; i < MI.getNumOperands(); ++i) + MIB.add(MI.getOperand(i)); + if (MI.isCandidateForCallSiteEntry()) + MF.moveCallSiteInfo(&MI, MIB.getInstr()); + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(MIB); + Bundler.append(BuildMI(MF, MI.getDebugLoc(), TII->get(ARM::t2BTI))); + finalizeBundle(MBB, Bundler.begin(), Bundler.end()); + MI.eraseFromParent(); + return true; + } case ARM::LOADDUAL: case ARM::STOREDUAL: { Register PairReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index b866cf952ff1..4b59f9cb94ce 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -503,20 +503,12 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, StackAdjustingInsts DefCFAOffsetCandidates; bool HasFP = hasFP(MF); - // Allocate the vararg register save area. 
- if (ArgRegsSaveSize) { - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, - MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), ArgRegsSaveSize, true); - } - if (!AFI->hasStackFrame() && (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) { - if (NumBytes - ArgRegsSaveSize != 0) { - emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize), + if (NumBytes != 0) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), - NumBytes - ArgRegsSaveSize, true); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), NumBytes, true); } DefCFAOffsetCandidates.emitDefCFAOffsets(MBB, dl, TII, HasFP); return; @@ -562,13 +554,26 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move past FPCXT area. MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push; + + // Move past the PAC computation. + if (AFI->shouldSignReturnAddress()) + LastPush = MBBI++; + + // Move past FPCXT area. if (FPCXTSaveSize > 0) { LastPush = MBBI++; DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); } + // Allocate the vararg register save area. + if (ArgRegsSaveSize) { + emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, + MachineInstr::FrameSetup); + LastPush = std::prev(MBBI); + DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true); + } + // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; @@ -788,7 +793,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, case ARM::R11: case ARM::R12: if (STI.splitFramePushPop(MF)) { - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned DwarfReg = MRI->getDwarfRegNum( + Reg == ARM::R12 ? 
(unsigned)ARM::RA_AUTH_CODE : Reg, true); unsigned Offset = MFI.getObjectOffset(FI); unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); @@ -923,8 +929,9 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { - if (NumBytes - ReservedArgStack != 0) - emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ReservedArgStack, + if (NumBytes + IncomingArgStackToRestore != 0) + emitSPUpdate(isARM, MBB, MBBI, dl, TII, + NumBytes + IncomingArgStackToRestore, MachineInstr::FrameDestroy); } else { // Unwind MBBI to point to first LDR / VLDRD. @@ -1007,15 +1014,21 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea2Size()) MBBI++; if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; - if (AFI->getFPCXTSaveAreaSize()) MBBI++; - } - if (ReservedArgStack || IncomingArgStackToRestore) { - assert((int)ReservedArgStack + IncomingArgStackToRestore >= 0 && - "attempting to restore negative stack amount"); - emitSPUpdate(isARM, MBB, MBBI, dl, TII, - ReservedArgStack + IncomingArgStackToRestore, - MachineInstr::FrameDestroy); + if (ReservedArgStack || IncomingArgStackToRestore) { + assert((int)ReservedArgStack + IncomingArgStackToRestore >= 0 && + "attempting to restore negative stack amount"); + emitSPUpdate(isARM, MBB, MBBI, dl, TII, + ReservedArgStack + IncomingArgStackToRestore, + MachineInstr::FrameDestroy); + } + + // Validate PAC, It should have been already popped into R12. For CMSE entry + // function, the validation instruction is emitted during expansion of the + // tBXNS_RET, since the validation must use the value of SP at function + // entry, before saving, resp. after restoring, FPCXTNS. 
+ if (AFI->shouldSignReturnAddress() && !AFI->isCmseNSEntryFunction()) + BuildMI(MBB, MBBI, DebugLoc(), STI.getInstrInfo()->get(ARM::t2AUT)); } } @@ -1199,6 +1212,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + bool hasPAC = AFI->shouldSignReturnAddress(); DebugLoc DL; bool isTailCall = false; bool isInterrupt = false; @@ -1231,7 +1245,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && - STI.hasV5TOps() && MBB.succ_empty()) { + STI.hasV5TOps() && MBB.succ_empty() && !hasPAC) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -1580,6 +1594,11 @@ bool ARMFrameLowering::spillCalleeSavedRegisters( ARM::t2STR_PRE : ARM::STR_PRE_IMM; unsigned FltOpc = ARM::VSTMDDB_UPD; unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs(); + // Compute PAC in R12. + if (AFI->shouldSignReturnAddress()) { + BuildMI(MBB, MI, DebugLoc(), STI.getInstrInfo()->get(ARM::t2PAC)) + .setMIFlags(MachineInstr::FrameSetup); + } // Save the non-secure floating point context. if (llvm::any_of(CSI, [](const CalleeSavedInfo &C) { return C.getReg() == ARM::FPCXTNS; @@ -1789,6 +1808,13 @@ bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) return false; + // We are disabling shrinkwrapping for now when PAC is enabled, as + // shrinkwrapping can cause clobbering of r12 when the PAC code is + // generated. A follow-up patch will fix this in a more performant manner. 
+ if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress( + false /*SpillsLR */)) + return false; + return true; } @@ -2315,6 +2341,26 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( CSI.back().setRestored(false); } + // For functions, which sign their return address, upon function entry, the + // return address PAC is computed in R12. Treat R12 as a callee-saved register + // in this case. + const auto &AFI = *MF.getInfo<ARMFunctionInfo>(); + if (AFI.shouldSignReturnAddress()) { + // The order of register must match the order we push them, because the + // PEI assigns frame indices in that order. When compiling for return + // address sign and authenication, we use split push, therefore the orders + // we want are: + // LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8 + CSI.insert(find_if(CSI, + [=](const auto &CS) { + unsigned Reg = CS.getReg(); + return Reg == ARM::R10 || Reg == ARM::R11 || + Reg == ARM::R8 || Reg == ARM::R9 || + ARM::DPRRegClass.contains(Reg); + }), + CalleeSavedInfo(ARM::R12)); + } + return false; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 33d115945614..3d45db349644 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -391,6 +391,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); } setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -428,7 +429,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { } // Predicate types - const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; + const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1}; for (auto VT : pTypes) { addRegisterClass(VT, &ARM::VCCRRegClass); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -445,6 +446,16 @@ 
void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT, VT, Expand); } + setOperationAction(ISD::SETCC, MVT::v2i1, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Expand); + setOperationAction(ISD::AND, MVT::v2i1, Expand); + setOperationAction(ISD::OR, MVT::v2i1, Expand); + setOperationAction(ISD::XOR, MVT::v2i1, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); @@ -1647,6 +1658,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(ARMISD::CALL_PRED) MAKE_CASE(ARMISD::CALL_NOLINK) MAKE_CASE(ARMISD::tSECALL) + MAKE_CASE(ARMISD::t2CALL_BTI) MAKE_CASE(ARMISD::BRCOND) MAKE_CASE(ARMISD::BR_JT) MAKE_CASE(ARMISD::BR2_JT) @@ -1853,8 +1865,10 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, // MVE has a predicate register. if ((Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) || - (Subtarget->hasMVEFloatOps() && (VT == MVT::v4f32 || VT == MVT::v8f16))) + (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 || + VT == MVT::v16i8)) || + (Subtarget->hasMVEFloatOps() && + (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16))) return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); return VT.changeVectorElementTypeToInteger(); } @@ -2308,6 +2322,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isCmseNSCall = false; bool isSibCall = false; bool PreferIndirect = false; + bool GuardWithBTI = false; + + // Lower 'returns_twice' calls to a pseudo-instruction. 
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) && + !Subtarget->getNoBTIAtReturnTwice()) + GuardWithBTI = AFI->branchTargetEnforcement(); // Determine whether this is a non-secure function call. if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call")) @@ -2713,7 +2733,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { - if (isCmseNSCall) + if (GuardWithBTI) + CallOpc = ARMISD::t2CALL_BTI; + else if (isCmseNSCall) CallOpc = ARMISD::tSECALL; else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -2930,9 +2952,17 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. - if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) - return false; + // Similarly, if the function uses return address sign and authentication, + // r12 is needed to hold the PAC and is not available to hold the callee + // address. + if (Outs.size() >= 4 && + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) { + if (Subtarget->isThumb1Only()) + return false; + // Conservatively assume the function spills LR. + if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)) + return false; + } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. 
@@ -7616,7 +7646,10 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, unsigned NumElts = VT.getVectorNumElements(); unsigned BoolMask; unsigned BitsPerBool; - if (NumElts == 4) { + if (NumElts == 2) { + BitsPerBool = 8; + BoolMask = 0xff; + } else if (NumElts == 4) { BitsPerBool = 4; BoolMask = 0xf; } else if (NumElts == 8) { @@ -7699,6 +7732,46 @@ static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, DAG.getConstant(N, DL, MVT::i32)); } +// Returns true if the operation N can be treated as qr instruction variant at +// operand Op. +static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) { + switch (N->getOpcode()) { + case ISD::ADD: + case ISD::MUL: + case ISD::SADDSAT: + case ISD::UADDSAT: + return true; + case ISD::SUB: + case ISD::SSUBSAT: + case ISD::USUBSAT: + return N->getOperand(1).getNode() == Op; + case ISD::INTRINSIC_WO_CHAIN: + switch (N->getConstantOperandVal(0)) { + case Intrinsic::arm_mve_add_predicated: + case Intrinsic::arm_mve_mul_predicated: + case Intrinsic::arm_mve_qadd_predicated: + case Intrinsic::arm_mve_vhadd: + case Intrinsic::arm_mve_hadd_predicated: + case Intrinsic::arm_mve_vqdmulh: + case Intrinsic::arm_mve_qdmulh_predicated: + case Intrinsic::arm_mve_vqrdmulh: + case Intrinsic::arm_mve_qrdmulh_predicated: + case Intrinsic::arm_mve_vqdmull: + case Intrinsic::arm_mve_vqdmull_predicated: + return true; + case Intrinsic::arm_mve_sub_predicated: + case Intrinsic::arm_mve_qsub_predicated: + case Intrinsic::arm_mve_vhsub: + case Intrinsic::arm_mve_hsub_predicated: + return N->getOperand(2).getNode() == Op; + default: + return false; + } + default: + return false; + } +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. 
SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, @@ -7720,6 +7793,20 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (SplatUndef.isAllOnes()) return DAG.getUNDEF(VT); + // If all the users of this constant splat are qr instruction variants, + // generate a vdup of the constant. + if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && + (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && + all_of(BVN->uses(), + [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { + EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 + : SplatBitSize == 16 ? MVT::v8i16 + : MVT::v16i8; + SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32); + SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const); + return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup); + } + if ((ST->hasNEON() && SplatBitSize <= 64) || (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { // Check if an immediate VMOV works. @@ -8313,9 +8400,8 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, SDLoc DL(Op); SmallVector<SDValue, 8> VTBLMask; - for (ArrayRef<int>::iterator - I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) - VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); + for (int I : ShuffleMask) + VTBLMask.push_back(DAG.getConstant(I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, @@ -8346,6 +8432,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { static EVT getVectorTyFromPredicateVector(EVT VT) { switch (VT.getSimpleVT().SimpleTy) { + case MVT::v2i1: + return MVT::v2f64; case MVT::v4i1: return MVT::v4i32; case MVT::v8i1: @@ -8427,7 +8515,14 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, DAG.getUNDEF(NewVT), ShuffleMask); // Now return the result of comparing the shuffled vector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
+ // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 + // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s. + if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); } @@ -8927,8 +9022,15 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, ConVec = ExtractInto(NewV1, ConVec, j); ConVec = ExtractInto(NewV2, ConVec, j); - // Now return the result of comparing the subvector with zero, - // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. + // Now return the result of comparing the subvector with zero, which will + // generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we + // convert to a v4i1 compare to fill in the two halves of the i64 as i32s. 
+ if (VT == MVT::v2i1) { + SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec); + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, DAG.getConstant(ARMCC::NE, dl, MVT::i32)); }; @@ -8993,6 +9095,22 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); + if (NumElts == 2) { + EVT SubVT = MVT::v4i32; + SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); + for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, + DAG.getIntPtrConstant(i, dl)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j, dl, MVT::i32)); + SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, + DAG.getConstant(j + 1, dl, MVT::i32)); + } + SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec, + DAG.getConstant(ARMCC::NE, dl, MVT::i32)); + return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp); + } + EVT SubVT = MVT::getVectorVT(ElType, NumElts); SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { @@ -9839,16 +9957,17 @@ void ARMTargetLowering::ExpandDIV_Windows( static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); EVT MemVT = LD->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == Op.getValueType()); assert(LD->getExtensionType() == ISD::NON_EXTLOAD && "Expected a non-extending load"); assert(LD->isUnindexed() && "Expected a unindexed 
load"); - // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit + // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We - // need to make sure that 8/4 bits are actually loaded into the correct + // need to make sure that 8/4/2 bits are actually loaded into the correct // place, which means loading the value and then shuffling the values into // the bottom bits of the predicate. // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect @@ -9895,14 +10014,15 @@ void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); EVT MemVT = ST->getMemoryVT(); - assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && + assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + MemVT == MVT::v16i1) && "Expected a predicate type!"); assert(MemVT == ST->getValue().getValueType()); assert(!ST->isTruncatingStore() && "Expected a non-extending store"); assert(ST->isUnindexed() && "Expected a unindexed store"); - // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits - // unset and a scalar store. + // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with + // top bits unset and a scalar store. 
SDLoc dl(Op); SDValue Build = ST->getValue(); if (MemVT != MVT::v16i1) { @@ -9953,7 +10073,7 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, {ST->getChain(), Lo, Hi, ST->getBasePtr()}, MemVT, ST->getMemOperand()); } else if (Subtarget->hasMVEIntegerOps() && - ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || + ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1))) { return LowerPredicateStore(Op, DAG); } @@ -10561,25 +10681,23 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // associated with. DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; unsigned MaxCSNum = 0; - for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; - ++BB) { - if (!BB->isEHPad()) continue; + for (MachineBasicBlock &BB : *MF) { + if (!BB.isEHPad()) + continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. - for (MachineBasicBlock::iterator - II = BB->begin(), IE = BB->end(); II != IE; ++II) { - if (!II->isEHLabel()) continue; + for (MachineInstr &II : BB) { + if (!II.isEHLabel()) + continue; - MCSymbol *Sym = II->getOperand(0).getMCSymbol(); + MCSymbol *Sym = II.getOperand(0).getMCSymbol(); if (!MF->hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); - for (SmallVectorImpl<unsigned>::iterator - CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); - CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(&*BB); - MaxCSNum = std::max(MaxCSNum, *CSI); + for (unsigned Idx : CallSiteIdxs) { + CallSiteNumToLPad[Idx].push_back(&BB); + MaxCSNum = std::max(MaxCSNum, Idx); } break; } @@ -14002,8 +14120,8 @@ static SDValue PerformANDCombine(SDNode *N, EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || - VT == MVT::v8i1 || VT == MVT::v16i1) + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 || + VT == 
MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1) return SDValue(); APInt SplatBits, SplatUndef; @@ -14298,8 +14416,8 @@ static SDValue PerformORCombine(SDNode *N, if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - if (Subtarget->hasMVEIntegerOps() && - (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) + if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 || + VT == MVT::v8i1 || VT == MVT::v16i1)) return PerformORCombine_i1(N, DAG, Subtarget); APInt SplatBits, SplatUndef; @@ -14569,6 +14687,15 @@ static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) { if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1))) return SDValue(); SDValue CSInc = Cmp->getOperand(0); + + // Ignore any `And 1` nodes that may not yet have been removed. We are + // looking for a value that produces 1/0, so these have no effect on the + // code. + while (CSInc.getOpcode() == ISD::AND && + isa<ConstantSDNode>(CSInc.getOperand(1)) && + CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse()) + CSInc = CSInc.getOperand(0); + if (CSInc.getOpcode() != ARMISD::CSINC || !isNullConstant(CSInc.getOperand(0)) || !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse()) @@ -17897,6 +18024,23 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { if (!VT.isInteger()) return SDValue(); + // Fold away an unneccessary CMPZ/CMOV + // CMOV A, B, C1, $cpsr, (CMPZ (CMOV 1, 0, C2, D), 0) -> + // if C1==EQ -> CMOV A, B, C2, $cpsr, D + // if C1==NE -> CMOV A, B, NOT(C2), $cpsr, D + if (N->getConstantOperandVal(2) == ARMCC::EQ || + N->getConstantOperandVal(2) == ARMCC::NE) { + ARMCC::CondCodes Cond; + if (SDValue C = IsCMPZCSINC(N->getOperand(4).getNode(), Cond)) { + if (N->getConstantOperandVal(2) == ARMCC::NE) + Cond = ARMCC::getOppositeCondition(Cond); + return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0), + N->getOperand(1), + DAG.getTargetConstant(Cond, SDLoc(N), MVT::i32), + 
N->getOperand(3), C); + } + } + // Materialize a boolean comparison for integers so we can avoid branching. if (isNullConstant(FalseVal)) { if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { @@ -18564,7 +18708,8 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, return false; // These are for predicates - if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { + if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 || + Ty == MVT::v2i1)) { if (Fast) *Fast = true; return true; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index e3b422358cae..1c5f8389f57c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -69,6 +69,7 @@ class VectorType; CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. tSECALL, // CMSE non-secure function call. + t2CALL_BTI, // Thumb function call followed by BTI instruction. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index f53814a80e01..1ae0354ffc37 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -254,13 +254,6 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, // An LLVM ValueType representing a corresponding vector of // predicate bits, for use in ISel patterns that handle an IR // intrinsic describing the predicated form of the instruction. - // - // Usually, for a vector of N things, this will be vNi1. But for - // vectors of 2 values, we make an exception, and use v4i1 instead - // of v2i1. Rationale: MVE codegen doesn't support doing all the - // auxiliary operations on v2i1 (vector shuffles etc), and also, - // there's no MVE compare instruction that will _generate_ v2i1 - // directly. 
ValueType Pred = pred; // Same as Pred but for DblVec rather than Vec. @@ -294,25 +287,25 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, // Integer vector types that don't treat signed and unsigned differently. def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>; def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>; -def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>; -def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>; +def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "i", ?>; +def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "i", ?>; // Explicitly signed and unsigned integer vectors. They map to the // same set of LLVM ValueTypes as above, but are represented // differently in assembly and instruction encodings. def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>; def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>; -def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>; -def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>; +def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "s", 0b0>; +def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "s", 0b0>; def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>; def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>; -def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>; -def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>; +def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v2i1, 0b10, "u", 0b1>; +def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v2i1, ?, 0b11, "u", 0b1>; // FP vector types. 
def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>; -def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>; -def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>; +def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v2i1, 0b10, "f", ?>; +def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v2i1, ?, 0b11, "f", ?>; // Polynomial vector types. def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>; @@ -2260,6 +2253,31 @@ let Predicates = [HasMVEInt] in { (v4i32 (ARMvmovImm (i32 1)))), (i32 1))), (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; + + def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)), + (v16i8 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)), + (v8i16 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>; + def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)), + (v4i32 (ARMvdup (i32 1)))), + (i32 1))), + (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>; } @@ -4450,6 +4468,11 @@ multiclass two_predops<SDPatternOperator opnode, Instruction insn> { (insn (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p1), rGPR)), (i32 (COPY_TO_REGCLASS (v4i1 VCCR:$p2), rGPR))), VCCR))>; + def v2i1 : Pat<(v2i1 (opnode (v2i1 VCCR:$p1), (v2i1 VCCR:$p2))), + (v2i1 (COPY_TO_REGCLASS + (insn (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p1), rGPR)), + (i32 (COPY_TO_REGCLASS (v2i1 VCCR:$p2), 
rGPR))), + VCCR))>; } let Predicates = [HasMVEInt] in { @@ -4469,20 +4492,20 @@ def load_align4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ }]>; let Predicates = [HasMVEInt] in { - foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(i32 (predicate_cast (VT VCCR:$src))), (i32 (COPY_TO_REGCLASS (VT VCCR:$src), VCCR))>; def : Pat<(VT (predicate_cast (i32 VCCR:$src))), (VT (COPY_TO_REGCLASS (i32 VCCR:$src), VCCR))>; - foreach VT2 = [ v4i1, v8i1, v16i1 ] in + foreach VT2 = [ v2i1, v4i1, v8i1, v16i1 ] in def : Pat<(VT (predicate_cast (VT2 VCCR:$src))), (VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>; } // If we happen to be casting from a load we can convert that straight // into a predicate load, so long as the load is of the correct type. - foreach VT = [ v4i1, v8i1, v16i1 ] in { + foreach VT = [ v2i1, v4i1, v8i1, v16i1 ] in { def : Pat<(VT (predicate_cast (i32 (load_align4 taddrmode_imm7<2>:$addr)))), (VT (VLDR_P0_off taddrmode_imm7<2>:$addr))>; } @@ -5350,33 +5373,40 @@ class MVE_VxADDSUB_qr<string iname, string suffix, } multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract, - Intrinsic unpred_int, Intrinsic pred_int> { + Intrinsic unpred_int, Intrinsic pred_int, PatFrag add_op, + SDNode shift_op> { def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract, VTI.Size>; defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI, unpred_int, pred_int, 1, 1>; + defvar Inst = !cast<Instruction>(NAME); + + let Predicates = [HasMVEInt] in { + def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn))), (i32 1))), + (Inst MQPR:$Qm, rGPR:$Rn)>; + } } -multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> : - MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, - int_arm_mve_hadd_predicated>; +multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> : + MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, 
+ add_op, shift_op>; -multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> : - MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, - int_arm_mve_hsub_predicated>; +multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op> : + MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, + add_op, shift_op>; -defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>; -defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>; -defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>; -defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>; -defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>; -defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>; +defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32, addnsw, ARMvshrsImm>; +defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8, addnuw, ARMvshruImm>; +defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16, addnuw, ARMvshruImm>; +defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32, addnuw, ARMvshruImm>; -defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>; -defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>; -defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>; -defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>; -defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>; -defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>; +defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32, subnsw, ARMvshrsImm>; +defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8, subnuw, ARMvshruImm>; +defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16, subnuw, ARMvshruImm>; +defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32, subnuw, ARMvshruImm>; multiclass MVE_VADDSUB_qr_f<string iname, MVEVectorVTInfo VTI, bit subtract, SDNode Op, Intrinsic PredInt, SDPatternOperator IdentityVec> { @@ -6778,11 
+6808,15 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (vselect (v2i1 VCCR:$pred), (v2i64 MQPR:$v1), (v2i64 MQPR:$v2))), + (v2i64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2f64 (vselect (v2i1 VCCR:$pred), (v2f64 MQPR:$v1), (v2f64 MQPR:$v2))), + (v2f64 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, @@ -6808,6 +6842,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (zext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6815,6 +6851,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (sext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi8 
255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred, zero_reg))>; @@ -6822,6 +6860,8 @@ let Predicates = [HasMVEInt] in { (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred, zero_reg))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; + def : Pat<(v2i64 (anyext (v2i1 VCCR:$pred))), + (v2i64 (MVE_VPSEL (MVE_VMOVimmi64 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred, zero_reg))>; } let Predicates = [HasMVEFloat] in { @@ -6862,6 +6902,8 @@ def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, } let Predicates = [HasMVEInt] in { + def : Pat<(v2i1 (xor (v2i1 VCCR:$pred), (v2i1 (predicate_cast (i32 65535))))), + (v2i1 (MVE_VPNOT (v2i1 VCCR:$pred)))>; def : Pat<(v4i1 (xor (v4i1 VCCR:$pred), (v4i1 (predicate_cast (i32 65535))))), (v4i1 (MVE_VPNOT (v4i1 VCCR:$pred)))>; def : Pat<(v8i1 (xor (v8i1 VCCR:$pred), (v8i1 (predicate_cast (i32 65535))))), diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 4471317f4ea4..6e8e61ca2b8e 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5736,3 +5736,10 @@ def t2BTI : PACBTIHintSpaceNoOpsInst<"bti", 0b00001111>; def t2AUT : PACBTIHintSpaceUseInst<"aut", 0b00101101> { let hasSideEffects = 1; } + +def ARMt2CallBTI : SDNode<"ARMISD::t2CALL_BTI", SDT_ARMcall, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +def t2CALL_BTI : PseudoInst<(outs), (ins pred:$p, thumb_bl_target:$func), + IIC_Br, [(ARMt2CallBTI tglobaladdr:$func)]>, + Requires<[IsThumb2]>, Sched<[WriteBrL]>; diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index 9d1bfa414dff..dc5f1b92a6c2 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ 
b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -1076,6 +1076,9 @@ multiclass vrint_inst_anpm<string opc, bits<2> rm, } } + def : InstAlias<!strconcat("vrint", opc, ".f16.f16\t$Sd, $Sm"), + (!cast<Instruction>(NAME#"H") HPR:$Sd, HPR:$Sm), 0>, + Requires<[HasFullFP16]>; def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"), (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm), 0>, Requires<[HasFPARMv8]>; diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 3b10c60a0654..ef5fc12feb54 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2121,7 +2121,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { bool Modified = false; for (MachineBasicBlock &MBB : Fn) { Modified |= LoadStoreMultipleOpti(MBB); - if (STI->hasV5TOps()) + if (STI->hasV5TOps() && !AFI->shouldSignReturnAddress()) Modified |= MergeReturnIntoLDM(MBB); if (isThumb1) Modified |= CombineMovBx(MBB); @@ -2349,9 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB, unsigned LastOpcode = 0; unsigned LastBytes = 0; unsigned NumMove = 0; - for (int i = Ops.size() - 1; i >= 0; --i) { + for (MachineInstr *Op : llvm::reverse(Ops)) { // Make sure each operation has the same kind. 
- MachineInstr *Op = Ops[i]; unsigned LSMOpcode = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia); if (LastOpcode && LSMOpcode != LastOpcode) diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 3874db5792d6..f822672c4477 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -251,10 +251,7 @@ namespace { SetVector<MachineInstr *> &Predicates = PredicatedInsts[MI]->Predicates; if (Exclusive && Predicates.size() != 1) return false; - for (auto *PredMI : Predicates) - if (isVCTP(PredMI)) - return true; - return false; + return llvm::any_of(Predicates, isVCTP); } // Is the VPST, controlling the block entry, predicated upon a VCTP. @@ -351,10 +348,7 @@ namespace { } bool containsVCTP() const { - for (auto *MI : Insts) - if (isVCTP(MI)) - return true; - return false; + return llvm::any_of(Insts, isVCTP); } unsigned size() const { return Insts.size(); } @@ -1334,8 +1328,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; // Process inner loops first. - for (auto I = ML->begin(), E = ML->end(); I != E; ++I) - Changed |= ProcessLoop(*I); + for (MachineLoop *L : *ML) + Changed |= ProcessLoop(L); LLVM_DEBUG({ dbgs() << "ARM Loops: Processing loop containing:\n"; @@ -1699,7 +1693,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) { // If any of the instructions between the VCMP and VPST are predicated // then a different code path is expected to have merged the VCMP and // VPST already. 
- if (!std::any_of(++MachineBasicBlock::iterator(VCMP), + if (std::none_of(++MachineBasicBlock::iterator(VCMP), MachineBasicBlock::iterator(VPST), hasVPRUse) && RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(1).getReg()) && RDA->hasSameReachingDef(VCMP, VPST, VCMP->getOperand(2).getReg())) { diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 4077fc058217..d8d937055d23 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -289,7 +289,7 @@ public: return false; if (SignReturnAddressAll) return true; - return LRSpilled; + return SpillsLR; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 760a5a5a20cf..194d65cad8d1 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -211,6 +211,8 @@ def FPCXTS : ARMReg<15, "fpcxts">; def ZR : ARMReg<15, "zr">, DwarfRegNum<[15]>; +def RA_AUTH_CODE : ARMReg<12, "ra_auth_code">, DwarfRegNum<[143]>; + // Register classes. // // pc == Program Counter @@ -395,7 +397,7 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { } // MVE Condition code register. -def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1], 32, (add VPR)> { +def VCCR : RegisterClass<"ARM", [i32, v16i1, v8i1, v4i1, v2i1], 32, (add VPR)> { // let CopyCost = -1; // Don't allow copying of status registers. 
} diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index d51a888c951f..e61b90af31b0 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -18,6 +18,7 @@ #include "ARMConstantPoolValue.h" #include "ARMFrameLowering.h" #include "ARMISelLowering.h" +#include "ARMMachineFunctionInfo.h" #include "ARMSelectionDAGInfo.h" #include "llvm/ADT/Triple.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -534,6 +535,10 @@ protected: /// Selected instruction itineraries (one entry per itinerary class.) InstrItineraryData InstrItins; + /// NoBTIAtReturnTwice - Don't place a BTI instruction after + /// return-twice constructs (setjmp) + bool NoBTIAtReturnTwice = false; + /// Options passed via command line that could influence the target const TargetOptions &Options; @@ -840,6 +845,8 @@ public: /// to lr. This is always required on Thumb1-only targets, as the push and /// pop instructions can't access the high registers. bool splitFramePushPop(const MachineFunction &MF) const { + if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress()) + return true; return (getFramePointerReg() == ARM::R7 && MF.getTarget().Options.DisableFramePointerElim(MF)) || isThumb1Only(); @@ -948,6 +955,8 @@ public: bool hardenSlsRetBr() const { return HardenSlsRetBr; } bool hardenSlsBlr() const { return HardenSlsBlr; } bool hardenSlsNoComdat() const { return HardenSlsNoComdat; } + + bool getNoBTIAtReturnTwice() const { return NoBTIAtReturnTwice; } }; } // end namespace llvm diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 39f407ba7149..bfe078b06861 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -137,21 +137,18 @@ public: int getFPReg() const { return FPReg; } void emitFnStartLocNotes() const { - for (Locs::const_iterator FI = FnStartLocs.begin(), FE = FnStartLocs.end(); - FI != FE; ++FI) - 
Parser.Note(*FI, ".fnstart was specified here"); + for (const SMLoc &Loc : FnStartLocs) + Parser.Note(Loc, ".fnstart was specified here"); } void emitCantUnwindLocNotes() const { - for (Locs::const_iterator UI = CantUnwindLocs.begin(), - UE = CantUnwindLocs.end(); UI != UE; ++UI) - Parser.Note(*UI, ".cantunwind was specified here"); + for (const SMLoc &Loc : CantUnwindLocs) + Parser.Note(Loc, ".cantunwind was specified here"); } void emitHandlerDataLocNotes() const { - for (Locs::const_iterator HI = HandlerDataLocs.begin(), - HE = HandlerDataLocs.end(); HI != HE; ++HI) - Parser.Note(*HI, ".handlerdata was specified here"); + for (const SMLoc &Loc : HandlerDataLocs) + Parser.Note(Loc, ".handlerdata was specified here"); } void emitPersonalityLocNotes() const { @@ -452,7 +449,8 @@ class ARMAsmParser : public MCTargetAsmParser { int tryParseRegister(); bool tryParseRegisterWithWriteBack(OperandVector &); int tryParseShiftRegister(OperandVector &); - bool parseRegisterList(OperandVector &, bool EnforceOrder = true); + bool parseRegisterList(OperandVector &, bool EnforceOrder = true, + bool AllowRAAC = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parsePrefix(ARMMCExpr::VariantKind &RefKind); @@ -2572,17 +2570,15 @@ public: void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); - for (SmallVectorImpl<unsigned>::const_iterator - I = RegList.begin(), E = RegList.end(); I != E; ++I) - Inst.addOperand(MCOperand::createReg(*I)); + for (unsigned Reg : RegList) + Inst.addOperand(MCOperand::createReg(Reg)); } void addRegListWithAPSROperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const SmallVectorImpl<unsigned> &RegList = getRegList(); - for (SmallVectorImpl<unsigned>::const_iterator - I = RegList.begin(), E = RegList.end(); I != E; ++I) - 
Inst.addOperand(MCOperand::createReg(*I)); + for (unsigned Reg : RegList) + Inst.addOperand(MCOperand::createReg(Reg)); } void addDPRRegListOperands(MCInst &Inst, unsigned N) const { @@ -4464,8 +4460,8 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs, } /// Parse a register list. -bool ARMAsmParser::parseRegisterList(OperandVector &Operands, - bool EnforceOrder) { +bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, + bool AllowRAAC) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -4478,7 +4474,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, int Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); - + if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); // The reglist instructions have at most 16 registers, so reserve // space for that many. int EReg = 0; @@ -4492,7 +4489,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, ++Reg; } const MCRegisterClass *RC; - if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + if (Reg == ARM::RA_AUTH_CODE || + ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::GPRRegClassID]; else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::DPRRegClassID]; @@ -4513,11 +4511,15 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, while (Parser.getTok().is(AsmToken::Comma) || Parser.getTok().is(AsmToken::Minus)) { if (Parser.getTok().is(AsmToken::Minus)) { + if (Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); Parser.Lex(); // Eat the minus. 
SMLoc AfterMinusLoc = Parser.getTok().getLoc(); int EndReg = tryParseRegister(); if (EndReg == -1) return Error(AfterMinusLoc, "register expected"); + if (EndReg == ARM::RA_AUTH_CODE) + return Error(AfterMinusLoc, "pseudo-register not allowed"); // Allow Q regs and just interpret them as the two D sub-registers. if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) EndReg = getDRegFromQReg(EndReg) + 1; @@ -4526,7 +4528,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, if (Reg == EndReg) continue; // The register must be in the same register class as the first. - if (!RC->contains(EndReg)) + if ((Reg == ARM::RA_AUTH_CODE && + RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || + (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg))) return Error(AfterMinusLoc, "invalid register in register list"); // Ranges must go from low to high. if (MRI->getEncodingValue(Reg) > MRI->getEncodingValue(EndReg)) @@ -4551,13 +4555,15 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, Reg = tryParseRegister(); if (Reg == -1) return Error(RegLoc, "register expected"); + if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) + return Error(RegLoc, "pseudo-register not allowed"); // Allow Q regs and just interpret them as the two D sub-registers. bool isQReg = false; if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { Reg = getDRegFromQReg(Reg); isQReg = true; } - if (!RC->contains(Reg) && + if (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg) && RC->getID() == ARMMCRegisterClasses[ARM::GPRRegClassID].getID() && ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) { // switch the register classes, as GPRwithAPSRnospRegClassID is a partial @@ -4577,7 +4583,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, continue; } // The register must be in the same register class as the first. 
- if (!RC->contains(Reg)) + if ((Reg == ARM::RA_AUTH_CODE && + RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || + (Reg != ARM::RA_AUTH_CODE && !RC->contains(Reg))) return Error(RegLoc, "invalid register in register list"); // In most cases, the list must be monotonically increasing. An // exception is CLRM, which is order-independent anyway, so @@ -7106,13 +7114,12 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, return Error(Loc, "too many conditions on VPT instruction"); } unsigned Mask = 8; - for (unsigned i = ITMask.size(); i != 0; --i) { - char pos = ITMask[i - 1]; - if (pos != 't' && pos != 'e') { + for (char Pos : llvm::reverse(ITMask)) { + if (Pos != 't' && Pos != 'e') { return Error(Loc, "illegal IT block condition mask '" + ITMask + "'"); } Mask >>= 1; - if (ITMask[i - 1] == 'e') + if (Pos == 'e') Mask |= 8; } Operands.push_back(ARMOperand::CreateITMask(Mask, Loc)); @@ -11685,7 +11692,7 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) { SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands; // Parse the register list - if (parseRegisterList(Operands) || + if (parseRegisterList(Operands, true, true) || parseToken(AsmToken::EndOfStatement, "unexpected token in directive")) return true; ARMOperand &Op = (ARMOperand &)*Operands[0]; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 896b104e8d97..e060e59e3759 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -1289,34 +1289,65 @@ void ARMELFStreamer::emitPad(int64_t Offset) { PendingOffset -= Offset; } -void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, - bool IsVector) { - // Collect the registers in the register list - unsigned Count = 0; +static std::pair<unsigned, unsigned> +collectHWRegs(const MCRegisterInfo &MRI, unsigned Idx, + const SmallVectorImpl<unsigned> &RegList, 
bool IsVector, + uint32_t &Mask_) { uint32_t Mask = 0; - const MCRegisterInfo *MRI = getContext().getRegisterInfo(); - for (size_t i = 0; i < RegList.size(); ++i) { - unsigned Reg = MRI->getEncodingValue(RegList[i]); + unsigned Count = 0; + while (Idx > 0) { + unsigned Reg = RegList[Idx - 1]; + if (Reg == ARM::RA_AUTH_CODE) + break; + Reg = MRI.getEncodingValue(Reg); assert(Reg < (IsVector ? 32U : 16U) && "Register out of range"); unsigned Bit = (1u << Reg); if ((Mask & Bit) == 0) { Mask |= Bit; ++Count; } + --Idx; } - // Track the change the $sp offset: For the .save directive, the - // corresponding push instruction will decrease the $sp by (4 * Count). - // For the .vsave directive, the corresponding vpush instruction will - // decrease $sp by (8 * Count). - SPOffset -= Count * (IsVector ? 8 : 4); + Mask_ = Mask; + return {Idx, Count}; +} - // Emit the opcode - FlushPendingOffset(); - if (IsVector) - UnwindOpAsm.EmitVFPRegSave(Mask); - else - UnwindOpAsm.EmitRegSave(Mask); +void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList, + bool IsVector) { + uint32_t Mask; + unsigned Idx, Count; + const MCRegisterInfo &MRI = *getContext().getRegisterInfo(); + + // Collect the registers in the register list. Issue unwinding instructions in + // three parts: ordinary hardware registers, return address authentication + // code pseudo register, the rest of the registers. The RA PAC is kept in an + // architectural register (usually r12), but we treat it as a special case in + // order to distinguish between that register containing RA PAC or a general + // value. + Idx = RegList.size(); + while (Idx > 0) { + std::tie(Idx, Count) = collectHWRegs(MRI, Idx, RegList, IsVector, Mask); + if (Count) { + // Track the change the $sp offset: For the .save directive, the + // corresponding push instruction will decrease the $sp by (4 * Count). + // For the .vsave directive, the corresponding vpush instruction will + // decrease $sp by (8 * Count). 
+ SPOffset -= Count * (IsVector ? 8 : 4); + + // Emit the opcode + FlushPendingOffset(); + if (IsVector) + UnwindOpAsm.EmitVFPRegSave(Mask); + else + UnwindOpAsm.EmitRegSave(Mask); + } else if (Idx > 0 && RegList[Idx - 1] == ARM::RA_AUTH_CODE) { + --Idx; + SPOffset -= 4; + FlushPendingOffset(); + UnwindOpAsm.EmitRegSave(0); + } + } } void ARMELFStreamer::emitUnwindRaw(int64_t Offset, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp index 781627c3c425..50f416b23db2 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp @@ -64,8 +64,11 @@ namespace { } // end anonymous namespace void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) { - if (RegSave == 0u) + if (RegSave == 0u) { + // That's the special case for RA PAC. + EmitInt8(ARM::EHABI::UNWIND_OPCODE_POP_RA_AUTH_CODE); return; + } // One byte opcode to save register r14 and r11-r4 if (RegSave & (1u << 4)) { diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index dc58b5427425..7e31ea77f4f5 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -366,7 +366,7 @@ bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { while (!Worklist.empty()) { Register Reg = Worklist.pop_back_val(); for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { - if (count(ExpectedUsers, &MI)) + if (llvm::is_contained(ExpectedUsers, &MI)) continue; if (MI.getOpcode() != TargetOpcode::COPY || !MI.getOperand(0).getReg().isVirtual()) { diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 6a5bc9284266..0e6960bce32b 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -213,7 +213,8 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst 
*ActiveLaneMask, auto *TC = SE->getSCEV(TripCount); int VectorWidth = cast<FixedVectorType>(ActiveLaneMask->getType())->getNumElements(); - if (VectorWidth != 4 && VectorWidth != 8 && VectorWidth != 16) + if (VectorWidth != 2 && VectorWidth != 4 && VectorWidth != 8 && + VectorWidth != 16) return false; ConstantInt *ConstElemCount = nullptr; @@ -371,15 +372,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); + case 2: VCTPID = Intrinsic::arm_mve_vctp64; break; case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; - - // FIXME: vctp64 currently not supported because the predicate - // vector wants to be <2 x i1>, but v2i1 is not a legal MVE - // type, so problems happen at isel time. - // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics - // purposes, but takes a v4i1 instead of a v2i1. } Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 224c61b9f065..54e80a095dd4 100644 --- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -824,8 +824,8 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters( ARMRegSet CopyRegs; // Registers which can be used after pushing // LoRegs for saving HiRegs. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) { LoRegsToSave[Reg] = true; @@ -1021,8 +1021,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( BuildMI(MF, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); bool NeedsPop = false; - for (unsigned i = CSI.size(); i != 0; --i) { - CalleeSavedInfo &Info = CSI[i-1]; + for (CalleeSavedInfo &Info : llvm::reverse(CSI)) { unsigned Reg = Info.getReg(); // High registers (excluding lr) have already been dealt with @@ -1067,7 +1066,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters( if (NeedsPop) MBB.insert(MI, &*MIB); else - MF.DeleteMachineInstr(MIB); + MF.deleteMachineInstr(MIB); return true; } diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 672611ea2234..543d94875037 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -247,8 +247,8 @@ bool AVRFrameLowering::spillCalleeSavedRegisters( const TargetInstrInfo &TII = *STI.getInstrInfo(); AVRMachineFunctionInfo *AVRFI = MF.getInfo<AVRMachineFunctionInfo>(); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); bool IsNotLiveIn = !MBB.isLiveIn(Reg); assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 && diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp index 798d08393eae..51060018a5ca 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp +++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp @@ -571,8 +571,6 @@ void AVRInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, // See lib/CodeGen/RegisterRelaxation.cpp for details. // We end up here when a jump is too long for a RJMP instruction. 
BuildMI(&MBB, DL, get(AVR::JMPk)).addMBB(&NewDestBB); - - return; } } // end of namespace llvm diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp index 0348e2200acb..36237b2fc4fd 100644 --- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp @@ -93,8 +93,13 @@ static bool BPFPreserveDITypeImpl(Function &F) { Ty = DTy->getBaseType(); } - if (Ty->getName().empty()) - report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc"); + if (Ty->getName().empty()) { + if (isa<DISubroutineType>(Ty)) + report_fatal_error( + "SubroutineType not supported for BTF_TYPE_ID_REMOTE reloc"); + else + report_fatal_error("Empty type name for BTF_TYPE_ID_REMOTE reloc"); + } MD = Ty; } diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp index ebc04b40d428..29b99a84a6cd 100644 --- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp +++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/CSKYMCTargetDesc.h" #include "TargetInfo/CSKYTargetInfo.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/Register.h" #include "llvm/MC/MCContext.h" @@ -25,11 +26,24 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +using namespace llvm; + #define DEBUG_TYPE "csky-asm-parser" -using namespace llvm; +// Include the auto-generated portion of the compress emitter. 
+#define GEN_COMPRESS_INSTR +#include "CSKYGenCompressInstEmitter.inc" + +STATISTIC(CSKYNumInstrsCompressed, + "Number of C-SKY Compressed instructions emitted"); + +static cl::opt<bool> + EnableCompressedInst("enable-csky-asm-compressed-inst", cl::Hidden, + cl::init(false), + cl::desc("Enable C-SKY asm compressed instruction")); namespace { struct CSKYOperand; @@ -55,6 +69,10 @@ class CSKYAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID) override; + // Helper to actually emit an instruction to the MCStreamer. Also, when + // possible, compression of the instruction is performed. + void emitToStreamer(MCStreamer &S, const MCInst &Inst); + OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; @@ -264,12 +282,6 @@ public: bool isConstpool() const { return isConstPoolOp(); } bool isDataSymbol() const { return isConstPoolOp(); } - bool isSPOperand() const { - if (!isReg()) - return false; - return getReg() == CSKY::R14; - } - bool isPSRFlag() const { int64_t Imm; // Must be of 'immediate' type and a constant. 
@@ -755,10 +767,6 @@ bool CSKYAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "register is out of range"); } - case Match_InvalidSPOperand: { - SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); - return Error(ErrorLoc, "operand must be sp register"); - } case Match_RequiresSameSrcAndDst: { SMLoc ErrorLoc = ((CSKYOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "src and dst operand must be same"); @@ -776,27 +784,62 @@ bool CSKYAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, OperandVector &Operands, MCStreamer &Out) { - if (Inst.getOpcode() == CSKY::LDQ32 || Inst.getOpcode() == CSKY::STQ32) { + switch (Inst.getOpcode()) { + default: + break; + case CSKY::LDQ32: + case CSKY::STQ32: if (Inst.getOperand(1).getReg() != CSKY::R4 || Inst.getOperand(2).getReg() != CSKY::R7) { return Error(IDLoc, "Register sequence is not valid. 'r4-r7' expected"); } Inst.setOpcode(Inst.getOpcode() == CSKY::LDQ32 ? 
CSKY::LDM32 : CSKY::STM32); - Out.emitInstruction(Inst, getSTI()); - return false; - } else if (Inst.getOpcode() == CSKY::SEXT32 || - Inst.getOpcode() == CSKY::ZEXT32) { + break; + case CSKY::SEXT32: + case CSKY::ZEXT32: if (Inst.getOperand(2).getImm() < Inst.getOperand(3).getImm()) return Error(IDLoc, "msb must be greater or equal to lsb"); - } else if (Inst.getOpcode() == CSKY::INS32) { + break; + case CSKY::INS32: if (Inst.getOperand(3).getImm() < Inst.getOperand(4).getImm()) return Error(IDLoc, "msb must be greater or equal to lsb"); - } else if (Inst.getOpcode() == CSKY::IDLY32) { + break; + case CSKY::IDLY32: if (Inst.getOperand(0).getImm() > 32 || Inst.getOperand(0).getImm() < 0) return Error(IDLoc, "n must be in range [0,32]"); + break; + case CSKY::ADDC32: + case CSKY::SUBC32: + case CSKY::ADDC16: + case CSKY::SUBC16: + Inst.erase(std::next(Inst.begin())); + Inst.erase(std::prev(Inst.end())); + Inst.insert(std::next(Inst.begin()), MCOperand::createReg(CSKY::C)); + Inst.insert(Inst.end(), MCOperand::createReg(CSKY::C)); + break; + case CSKY::CMPNEI32: + case CSKY::CMPNEI16: + case CSKY::CMPNE32: + case CSKY::CMPNE16: + case CSKY::CMPHSI32: + case CSKY::CMPHSI16: + case CSKY::CMPHS32: + case CSKY::CMPHS16: + case CSKY::CMPLTI32: + case CSKY::CMPLTI16: + case CSKY::CMPLT32: + case CSKY::CMPLT16: + case CSKY::BTSTI32: + Inst.erase(Inst.begin()); + Inst.insert(Inst.begin(), MCOperand::createReg(CSKY::C)); + break; + case CSKY::MVCV32: + Inst.erase(std::next(Inst.begin())); + Inst.insert(Inst.end(), MCOperand::createReg(CSKY::C)); + break; } - Out.emitInstruction(Inst, getSTI()); + emitToStreamer(Out, Inst); return false; } @@ -1422,6 +1465,16 @@ OperandMatchResultTy CSKYAsmParser::tryParseRegister(unsigned &RegNo, bool CSKYAsmParser::ParseDirective(AsmToken DirectiveID) { return true; } +void CSKYAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) { + MCInst CInst; + bool Res = false; + if (EnableCompressedInst) + Res = compressInst(CInst, Inst, 
getSTI(), S.getContext()); + if (Res) + ++CSKYNumInstrsCompressed; + S.emitInstruction((Res ? CInst : Inst), getSTI()); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmParser() { RegisterMCAsmParser<CSKYAsmParser> X(getTheCSKYTarget()); } diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp index 1c38c5d1fde6..85129f78e726 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp @@ -30,6 +30,9 @@ using namespace llvm; #define DEBUG_TYPE "csky-asm-printer" +STATISTIC(CSKYNumInstrsCompressed, + "Number of C-SKY Compressed instructions emitted"); + CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM, std::unique_ptr<llvm::MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {} @@ -39,6 +42,16 @@ bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) { return AsmPrinter::runOnMachineFunction(MF); } +#define GEN_COMPRESS_INSTR +#include "CSKYGenCompressInstEmitter.inc" +void CSKYAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) { + MCInst CInst; + bool Res = compressInst(CInst, Inst, *Subtarget, OutStreamer->getContext()); + if (Res) + ++CSKYNumInstrsCompressed; + AsmPrinter::EmitToStreamer(*OutStreamer, Res ? CInst : Inst); +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "CSKYGenMCPseudoLowering.inc" diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h index f0f5d8657c04..b30311e0ca64 100644 --- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h +++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h @@ -26,6 +26,8 @@ public: StringRef getPassName() const override { return "CSKY Assembly Printer"; } + void EmitToStreamer(MCStreamer &S, const MCInst &Inst); + /// tblgen'erated driver function for lowering simple MI->MC /// pseudo instructions. 
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, diff --git a/llvm/lib/Target/CSKY/CSKYCallingConv.td b/llvm/lib/Target/CSKY/CSKYCallingConv.td index 87e2e6b9dc31..91102e3714df 100644 --- a/llvm/lib/Target/CSKY/CSKYCallingConv.td +++ b/llvm/lib/Target/CSKY/CSKYCallingConv.td @@ -79,4 +79,4 @@ def RetCC_CSKY_ABIV2_FP : CallingConv<[ CCIfType<[i32], CCAssignToReg<[R0, R1]>>, CCIfType<[f32], CCAssignToReg<[F0_32]>>, CCIfType<[f64], CCAssignToReg<[F0_64]>> -]>;
\ No newline at end of file +]>; diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index 9b22c95cfe21..3a8ee5713584 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -54,4 +54,4 @@ void CSKYFrameLowering::emitPrologue(MachineFunction &MF, void CSKYFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { // FIXME: Implement this when we have function calls -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp index fc9ef8bfd9d9..8dc91904b8cc 100644 --- a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp @@ -40,6 +40,8 @@ public: } void Select(SDNode *N) override; + bool selectAddCarry(SDNode *N); + bool selectSubCarry(SDNode *N); #include "CSKYGenDAGISel.inc" }; @@ -60,7 +62,12 @@ void CSKYDAGToDAGISel::Select(SDNode *N) { switch (Opcode) { default: break; - // FIXME: Add selection nodes needed later. + case ISD::ADDCARRY: + IsSelected = selectAddCarry(N); + break; + case ISD::SUBCARRY: + IsSelected = selectSubCarry(N); + break; } if (IsSelected) @@ -70,6 +77,86 @@ void CSKYDAGToDAGISel::Select(SDNode *N) { SelectCode(N); } +bool CSKYDAGToDAGISel::selectAddCarry(SDNode *N) { + MachineSDNode *NewNode = nullptr; + auto Type0 = N->getValueType(0); + auto Type1 = N->getValueType(1); + auto Op0 = N->getOperand(0); + auto Op1 = N->getOperand(1); + auto Op2 = N->getOperand(2); + + SDLoc Dl(N); + + if (isNullConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::CLRC32 : CSKY::CLRC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::ADDC32 : CSKY::ADDC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else if (isOneConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SETC32 : CSKY::SETC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::ADDC32 : CSKY::ADDC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else { + NewNode = CurDAG->getMachineNode(Subtarget->has2E3() ? CSKY::ADDC32 + : CSKY::ADDC16, + Dl, {Type0, Type1}, {Op0, Op1, Op2}); + } + ReplaceNode(N, NewNode); + return true; +} + +static SDValue InvertCarryFlag(const CSKYSubtarget *Subtarget, + SelectionDAG *DAG, SDLoc Dl, SDValue OldCarry) { + auto NewCarryReg = + DAG->getMachineNode(Subtarget->has2E3() ? 
CSKY::MVCV32 : CSKY::MVCV16, Dl, + MVT::i32, OldCarry); + auto NewCarry = + DAG->getMachineNode(Subtarget->hasE2() ? CSKY::BTSTI32 : CSKY::BTSTI16, + Dl, OldCarry.getValueType(), SDValue(NewCarryReg, 0), + DAG->getTargetConstant(0, Dl, MVT::i32)); + return SDValue(NewCarry, 0); +} + +bool CSKYDAGToDAGISel::selectSubCarry(SDNode *N) { + MachineSDNode *NewNode = nullptr; + auto Type0 = N->getValueType(0); + auto Type1 = N->getValueType(1); + auto Op0 = N->getOperand(0); + auto Op1 = N->getOperand(1); + auto Op2 = N->getOperand(2); + + SDLoc Dl(N); + + if (isNullConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SETC32 : CSKY::SETC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SUBC32 : CSKY::SUBC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else if (isOneConstant(Op2)) { + auto *CA = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::CLRC32 : CSKY::CLRC16, Dl, Type1); + NewNode = CurDAG->getMachineNode( + Subtarget->has2E3() ? CSKY::SUBC32 : CSKY::SUBC16, Dl, {Type0, Type1}, + {Op0, Op1, SDValue(CA, 0)}); + } else { + auto CarryIn = InvertCarryFlag(Subtarget, CurDAG, Dl, Op2); + NewNode = CurDAG->getMachineNode(Subtarget->has2E3() ? 
CSKY::SUBC32 + : CSKY::SUBC16, + Dl, {Type0, Type1}, {Op0, Op1, CarryIn}); + } + auto CarryOut = InvertCarryFlag(Subtarget, CurDAG, Dl, SDValue(NewNode, 1)); + + ReplaceUses(SDValue(N, 0), SDValue(NewNode, 0)); + ReplaceUses(SDValue(N, 1), CarryOut); + CurDAG->RemoveDeadNode(N); + + return true; +} + FunctionPass *llvm::createCSKYISelDag(CSKYTargetMachine &TM) { return new CSKYDAGToDAGISel(TM); } diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp index ac6d069e592c..a1f7cc685d4c 100644 --- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp @@ -37,6 +37,46 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM, // Register Class addRegisterClass(MVT::i32, &CSKY::GPRRegClass); + setOperationAction(ISD::ADDCARRY, MVT::i32, Legal); + setOperationAction(ISD::SUBCARRY, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::CTTZ, MVT::i32, Expand); + setOperationAction(ISD::CTPOP, MVT::i32, Expand); + setOperationAction(ISD::ROTR, MVT::i32, Expand); + setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); + setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::MULHS, MVT::i32, Expand); + setOperationAction(ISD::MULHU, MVT::i32, Expand); + + setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i1, Promote); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, 
Promote); + setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i1, Promote); + + if (!Subtarget.hasE2()) { + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Expand); + setOperationAction(ISD::CTLZ, MVT::i32, Expand); + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + } + + if (!Subtarget.has2E3()) { + setOperationAction(ISD::ABS, MVT::i32, Expand); + setOperationAction(ISD::BITREVERSE, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIV, MVT::i32, Expand); + } + // Compute derived properties from the register classes. computeRegisterProperties(STI.getRegisterInfo()); diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td index 6d42bddcdd78..ea0761d97545 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrFormats16Instr.td @@ -88,6 +88,19 @@ class R16_XZ_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< let Inst{1, 0} = sop; } +class R16_Z_UNOP<bits<4> op, bits<2> sop, string opstr> : CSKY16Inst< + AddrModeNone, (outs sGPR:$rz), (ins sGPR:$rx), !strconcat(opstr, "\t$rz"), + []> { + bits<4> rz; + bits<4> rx; + let Inst{15, 14} = 0b01; + let Inst{13 - 10} = op; + let Inst{9 - 6} = rz; + let Inst{5 - 2} = rx; + let Inst{1, 0} = sop; + let Constraints = "$rz = $rx"; +} + class R16_XY_CMP<bits<2> sop, string opstr> : CSKY16Inst< AddrModeNone, (outs CARRY:$ca), (ins sGPR:$rx, sGPR:$ry), !strconcat(opstr, "\t$rx, $ry"), []> { @@ -146,7 +159,7 @@ class I16_X_CMP<bits<3> sop, string opstr, Operand Immoperand> : CSKY16Inst< } class I16_SP_IMM7<bits<3> sop, string opstr> : CSKY16Inst< - AddrModeNone, (outs SPOp:$sp2), (ins SPOp:$sp1, uimm7_2:$imm7), + AddrModeNone, (outs GPRSP:$sp2), (ins GPRSP:$sp1, uimm7_2:$imm7), !strconcat(opstr, "\t$sp2, $sp1, $imm7"), []> { bits<7> imm7; let Inst{15, 14} = 0b00; diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp 
b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp index e12235cf9478..6fcb136cd99b 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp @@ -11,6 +11,8 @@ //===----------------------------------------------------------------------===// #include "CSKYInstrInfo.h" +#include "CSKYMachineFunctionInfo.h" +#include "CSKYTargetMachine.h" #include "llvm/MC/MCContext.h" #define DEBUG_TYPE "csky-instr-info" @@ -23,3 +25,289 @@ using namespace llvm; CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI) : CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) { } + +Register CSKYInstrInfo::movImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t Val, + MachineInstr::MIFlag Flag) const { + assert(isUInt<32>(Val) && "should be uint32"); + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + Register DstReg; + if (STI.hasE2()) { + DstReg = MRI.createVirtualRegister(&CSKY::GPRRegClass); + + if (isUInt<16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI32), DstReg) + .addImm(Val & 0xFFFF) + .setMIFlags(Flag); + } else if (isShiftedUInt<16, 16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVIH32), DstReg) + .addImm((Val >> 16) & 0xFFFF) + .setMIFlags(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVIH32), DstReg) + .addImm((Val >> 16) & 0xFFFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::ORI32), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFFFF) + .setMIFlags(Flag); + } + + } else { + DstReg = MRI.createVirtualRegister(&CSKY::mGPRRegClass); + if (isUInt<8>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else if (isUInt<16>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + 
.addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else if (isUInt<24>(Val)) { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 16) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 8) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } else { + BuildMI(MBB, MBBI, DL, get(CSKY::MOVI16), DstReg) + .addImm((Val >> 24) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 16) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 16) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if (((Val >> 8) & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm((Val >> 8) & 0xFF) + .setMIFlags(Flag); + BuildMI(MBB, MBBI, DL, get(CSKY::LSLI16), DstReg) + .addReg(DstReg) + .addImm(8) + .setMIFlags(Flag); + if ((Val & 0xFF) != 0) + BuildMI(MBB, MBBI, DL, get(CSKY::ADDI16), DstReg) + .addReg(DstReg) + .addImm(Val & 0xFF) + .setMIFlags(Flag); + } + } + + return DstReg; +} + +unsigned CSKYInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { + default: + return 0; + case CSKY::LD16B: + case CSKY::LD16H: + case CSKY::LD16W: + case CSKY::LD32B: + case CSKY::LD32BS: + case CSKY::LD32H: + case CSKY::LD32HS: + case CSKY::LD32W: + case CSKY::RESTORE_CARRY: + break; + } + + if (MI.getOperand(1).isFI() && 
MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + + return 0; +} + +unsigned CSKYInstrInfo::isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const { + switch (MI.getOpcode()) { + default: + return 0; + case CSKY::ST16B: + case CSKY::ST16H: + case CSKY::ST16W: + case CSKY::ST32B: + case CSKY::ST32H: + case CSKY::ST32W: + case CSKY::SPILL_CARRY: + break; + } + + if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); + } + + return 0; +} + +void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register SrcReg, bool IsKill, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + + MachineFunction &MF = *MBB.getParent(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = 0; + + if (CSKY::GPRRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::ST32W; // Optimize for 16bit + } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::SPILL_CARRY; + CFI->setSpillsCR(); + } else { + llvm_unreachable("Unknown RegisterClass"); + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + Register DestReg, int FI, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (I != MBB.end()) + DL = I->getDebugLoc(); + + MachineFunction &MF 
= *MBB.getParent(); + CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + + unsigned Opcode = 0; + + if (CSKY::GPRRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::LD32W; + } else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) { + Opcode = CSKY::RESTORE_CARRY; + CFI->setSpillsCR(); + } else { + llvm_unreachable("Unknown RegisterClass"); + } + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); + + BuildMI(MBB, I, DL, get(Opcode), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); +} + +void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { + + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + if (CSKY::GPRRegClass.contains(SrcReg) && + CSKY::CARRYRegClass.contains(DestReg)) { + if (STI.hasE2()) { + BuildMI(MBB, I, DL, get(CSKY::BTSTI32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + } else { + assert(SrcReg < CSKY::R8); + BuildMI(MBB, I, DL, get(CSKY::BTSTI16), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + } + return; + } + + if (CSKY::CARRYRegClass.contains(SrcReg) && + CSKY::GPRRegClass.contains(DestReg)) { + + if (STI.hasE2()) { + BuildMI(MBB, I, DL, get(CSKY::MVC32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + assert(DestReg < CSKY::R16); + assert(DestReg < CSKY::R8); + BuildMI(MBB, I, DL, get(CSKY::MOVI16), DestReg).addImm(0); + BuildMI(MBB, I, DL, get(CSKY::ADDC16)) + .addReg(DestReg, RegState::Define) + .addReg(SrcReg, RegState::Define) + .addReg(DestReg, getKillRegState(true)) + .addReg(DestReg, getKillRegState(true)) + .addReg(SrcReg, getKillRegState(true)); + BuildMI(MBB, I, DL, get(CSKY::BTSTI16)) + .addReg(SrcReg, RegState::Define | getDeadRegState(KillSrc)) + 
.addReg(DestReg) + .addImm(0); + } + return; + } + + unsigned Opcode = 0; + if (CSKY::GPRRegClass.contains(DestReg, SrcReg)) + Opcode = CSKY::MOV32; + else { + LLVM_DEBUG(dbgs() << "src = " << SrcReg << ", dst = " << DestReg); + LLVM_DEBUG(I->dump()); + llvm_unreachable("Unknown RegisterClass"); + } + + BuildMI(MBB, I, DL, get(Opcode), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h index 04be9da27b57..450641d96b74 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h @@ -29,6 +29,31 @@ protected: public: explicit CSKYInstrInfo(CSKYSubtarget &STI); + + unsigned isLoadFromStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr &MI, + int &FrameIndex) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register SrcReg, + bool IsKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, Register DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, + bool KillSrc) const override; + + // Materializes the given integer Val into DstReg. 
+ Register movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, int64_t Val, + MachineInstr::MIFlag Flag = MachineInstr::NoFlags) const; }; } // namespace llvm diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td index 9dda3159e446..30d9206eec68 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td @@ -52,6 +52,11 @@ class OImmAsmOperand<int width, string suffix = ""> : ImmAsmOperand<"O", width, suffix> { } +def to_tframeindex : SDNodeXForm<frameindex, [{ + auto FI = cast<FrameIndexSDNode>(N); + return CurDAG->getTargetFrameIndex(FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout())); +}]>; + class oimm<int num> : Operand<i32>, ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> { let EncoderMethod = "getOImmOpValue"; @@ -166,9 +171,23 @@ def bare_symbol : Operand<iPTR> { let OperandType = "OPERAND_PCREL"; } -def oimm3 : oimm<3>; +def oimm3 : oimm<3> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<3>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def oimm4 : oimm<4>; -def oimm5 : oimm<5>; +def oimm5 : oimm<5> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<5>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def oimm6 : oimm<6>; def imm5_idly : Operand<i32>, ImmLeaf<i32, @@ -177,9 +196,30 @@ def imm5_idly : Operand<i32>, ImmLeaf<i32, let DecoderMethod = "decodeOImmOperand<5>"; } -def oimm8 : oimm<8>; -def oimm12 : oimm<12>; -def oimm16 : oimm<16>; +def oimm8 : oimm<8> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<8>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} +def oimm12 : oimm<12> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<12>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} +def oimm16 : oimm<16> { + let 
MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isUInt<16>(Imm - 1); + return MCOp.isBareSymbolRef(); + }]; +} def nimm12 : nimm<12>; @@ -195,28 +235,98 @@ def uimm2_jmpix : Operand<i32>, def uimm3 : uimm<3>; def uimm4 : uimm<4>; -def uimm5 : uimm<5>; +def uimm5 : uimm<5> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm5_msb_size : uimm<5> { let EncoderMethod = "getImmOpValueMSBSize"; } -def uimm5_1 : uimm<5, 1>; -def uimm5_2 : uimm<5, 2>; +def uimm5_1 : uimm<5, 1> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 1>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm5_2 : uimm<5, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<5, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm6 : uimm<6>; def uimm7 : uimm<7>; def uimm7_1 : uimm<7, 1>; -def uimm7_2 : uimm<7, 2>; +def uimm7_2 : uimm<7, 2>{ + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<7, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm7_3 : uimm<7, 3>; -def uimm8 : uimm<8>; -def uimm8_2 : uimm<8, 2>; +def uimm8 : uimm<8> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<8, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm8_2 : uimm<8, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<8, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm8_3 : uimm<8, 3>; def uimm8_8 : uimm<8, 8>; def uimm8_16 : uimm<8, 16>; def uimm8_24 : uimm<8, 24>; -def uimm12 : uimm<12>; -def uimm12_1 : uimm<12, 1>; -def uimm12_2 : uimm<12, 2>; -def uimm16 : uimm<16>; +def uimm12 : uimm<12> { + let MCOperandPredicate = [{ + int64_t Imm; 
+ if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm12_1 : uimm<12, 1> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 1>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm12_2 : uimm<12, 2> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<12, 2>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} +def uimm16 : uimm<16> { + let MCOperandPredicate = [{ + int64_t Imm; + if (MCOp.evaluateAsConstantImm(Imm)) + return isShiftedUInt<16, 0>(Imm); + return MCOp.isBareSymbolRef(); + }]; +} def uimm16_8 : uimm<16, 8>; def uimm16_16 : uimm<16, 16>; def uimm20 : uimm<20>; @@ -642,11 +752,6 @@ def BSR32_BR : J<0x38, (outs), (ins call_symbol:$offset), "bsr32", []>{ let Defs = [ R15 ]; } -let Predicates = [iHasE2], isCodeGenOnly = 1 in { - def RTS32 : I_16_RET<0x6, 0xF, "rts32", [(CSKY_RET)]>; -} - - //===----------------------------------------------------------------------===// // Symbol address instructions. //===----------------------------------------------------------------------===// @@ -872,6 +977,102 @@ def TRAP32 : CSKY32Inst<AddrModeNone, 0x30, (outs), (ins uimm2:$imm2), "trap32 $ } +//===----------------------------------------------------------------------===// +// Instruction Patterns. 
+//===----------------------------------------------------------------------===// + +// Load & Store Patterns +multiclass LdPat<PatFrag LoadOp, ImmLeaf imm_type, Instruction Inst, ValueType Type> { + def : Pat<(Type (LoadOp GPR:$rs1)), (Inst GPR:$rs1, 0)>; + def : Pat<(Type (LoadOp (i32 frameindex:$rs1))), (Inst (i32 (to_tframeindex tframeindex:$rs1)), 0)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, imm_type:$uimm))), + (Inst GPR:$rs1, imm_type:$uimm)>; + def : Pat<(Type (LoadOp (add frameindex:$rs1, imm_type:$uimm))), + (Inst (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm)>; + def : Pat<(Type (LoadOp (eqToAdd frameindex:$rs1, imm_type:$uimm))), + (Inst (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, tglobaladdr:$gd))), + (Inst GPR:$rs1, tglobaladdr:$gd)>; +} + +defm : LdPat<extloadi8, uimm12, LD32B, i32>; +defm : LdPat<zextloadi8, uimm12, LD32B, i32>; +let Predicates = [iHasE2] in { + defm : LdPat<sextloadi8, uimm12, LD32BS, i32>; +} +defm : LdPat<extloadi16, uimm12_1, LD32H, i32>; +defm : LdPat<zextloadi16, uimm12_1, LD32H, i32>; +let Predicates = [iHasE2] in { +defm : LdPat<sextloadi16, uimm12_1, LD32HS, i32>; +} +defm : LdPat<load, uimm12_2, LD32W, i32>; + +multiclass LdrPat<PatFrag LoadOp, Instruction Inst, ValueType Type> { + def : Pat<(Type (LoadOp (add GPR:$rs1, GPR:$rs2))), (Inst GPR:$rs1, GPR:$rs2, 0)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 1))))), (Inst GPR:$rs1, GPR:$rs2, 1)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 2))))), (Inst GPR:$rs1, GPR:$rs2, 2)>; + def : Pat<(Type (LoadOp (add GPR:$rs1, (shl GPR:$rs2, (i32 3))))), (Inst GPR:$rs1, GPR:$rs2, 3)>; +} + +let Predicates = [iHas2E3] in { + defm : LdrPat<zextloadi8, LDR32B, i32>; + defm : LdrPat<sextloadi8, LDR32BS, i32>; + defm : LdrPat<extloadi8, LDR32BS, i32>; + defm : LdrPat<zextloadi16, LDR32H, i32>; + defm : LdrPat<sextloadi16, LDR32HS, i32>; + defm : LdrPat<extloadi16, LDR32HS, i32>; + defm : 
LdrPat<load, LDR32W, i32>; +} + +multiclass StPat<PatFrag StoreOp, ValueType Type, ImmLeaf imm_type, Instruction Inst> { + def : Pat<(StoreOp Type:$rs2, GPR:$rs1), (Inst Type:$rs2, GPR:$rs1, 0)>; + def : Pat<(StoreOp Type:$rs2, frameindex:$rs1), (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), 0)>; + def : Pat<(StoreOp Type:$rs2, (add GPR:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, GPR:$rs1, imm_type:$uimm12)>; + def : Pat<(StoreOp Type:$rs2, (add frameindex:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm12)>; + def : Pat<(StoreOp Type:$rs2, (eqToAdd frameindex:$rs1, imm_type:$uimm12)), + (Inst Type:$rs2, (i32 (to_tframeindex tframeindex:$rs1)), imm_type:$uimm12)>; +} + +defm : StPat<truncstorei8, i32, uimm12, ST32B>; +defm : StPat<truncstorei16, i32, uimm12_1, ST32H>; +defm : StPat<store, i32, uimm12_2, ST32W>; + +multiclass StrPat<PatFrag StoreOp, ValueType Type, Instruction Inst> { + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, GPR:$rs2)), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 0)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 1)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 1)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 2)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 2)>; + def : Pat<(StoreOp Type:$rz, (add GPR:$rs1, (shl GPR:$rs2, (i32 3)))), (Inst Type:$rz, GPR:$rs1, GPR:$rs2, 3)>; +} + +let Predicates = [iHas2E3] in { + defm : StrPat<truncstorei8, i32, STR32B>; + defm : StrPat<truncstorei16, i32, STR32H>; + defm : StrPat<store, i32, STR32W>; + + // Sext & Zext Patterns + def : Pat<(sext_inreg GPR:$src, i1), (SEXT32 GPR:$src, 0, 0)>; + def : Pat<(and GPR:$src, 255), (ZEXT32 GPR:$src, 7, 0)>; + def : Pat<(and GPR:$src, 65535), (ZEXT32 GPR:$src, 15, 0)>; +} + +// Constant materialize patterns. +let Predicates = [iHasE2] in + def : Pat<(i32 imm:$imm), + (ORI32 (MOVIH32 (uimm32_hi16 imm:$imm)), (uimm32_lo16 imm:$imm))>; + + +// Other operations. 
+let Predicates = [iHasE2] in { + def : Pat<(rotl GPR:$rs1, GPR:$rs2), + (ROTL32 GPR:$rs1, (ANDI32 GPR:$rs2, 0x1f))>; + let Predicates = [iHas2E3] in { + def : Pat<(bitreverse GPR:$rx), (BREV32 GPR:$rx)>; + def : Pat<(bswap GPR:$rx), (REVB32 GPR:$rx)>; + } + def : Pat<(i32 (ctlz GPR:$rx)), (FF1 GPR:$rx)>; +} //===----------------------------------------------------------------------===// // Pseudo for assembly diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td index c98f43622155..6a9dd03dfa1d 100644 --- a/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td +++ b/llvm/lib/Target/CSKY/CSKYInstrInfo16Instr.td @@ -33,16 +33,6 @@ def br_symbol_16bit : Operand<iPTR> { let OperandType = "OPERAND_PCREL"; } -def SPOperand : AsmOperandClass { - let Name = "SPOperand"; - let RenderMethod = "addRegOperands"; - let DiagnosticType = !strconcat("Invalid", Name); -} - -def SPOp : RegisterOperand<GPR> { - let ParserMatchClass = SPOperand; -} - def constpool_symbol_16bit : Operand<iPTR> { let ParserMatchClass = Constpool; let EncoderMethod = @@ -83,7 +73,7 @@ let isCommutable = 1 in { def XOR16 : R16_XZ_BINOP<0b1011, 0b01, "xor16", BinOpFrag<(xor node:$LHS, node:$RHS)>>; def NOR16 : R16_XZ_BINOP<0b1011, 0b10, "nor16", BinOpFrag<(not (or node:$LHS, node:$RHS))>>; let isCodeGenOnly = 1 in - def NOT16 : R16_XZ_UNOP<0b1011, 0b10, "not16">; + def NOT16 : R16_Z_UNOP<0b1011, 0b10, "not16">; def MULT16 : R16_XZ_BINOP<0b1111, 0b00, "mult16", BinOpFrag<(mul node:$LHS, node:$RHS)>>; } def SUBU16XZ : R16_XZ_BINOP<0b1000, 0b10, "subu16", BinOpFrag<(sub node:$LHS, node:$RHS)>>; @@ -108,7 +98,7 @@ let Constraints = "$rZ = $rz", isReMaterializable = 1, isAsCheapAsAMove = 1 in { } let isAdd = 1 in -def ADDI16ZSP : I16_Z_8<0b011, (ins SPOp:$sp, uimm8_2:$imm8), +def ADDI16ZSP : I16_Z_8<0b011, (ins GPRSP:$sp, uimm8_2:$imm8), "addi16\t$rz, $sp, $imm8">; let isAdd = 1 in @@ -150,9 +140,9 @@ def ST16W : I16_XZ_LDST<AddrMode16W, 0b110, "st16.w", (outs), (ins 
mGPR:$rz, mGPR:$rx, uimm5_2:$imm)>; def LD16WSP : I16_ZSP_LDST<AddrMode16W, 0b011, "ld16.w", - (outs mGPR:$rz), (ins SPOp:$sp, uimm8_2:$addr)>; + (outs mGPR:$rz), (ins GPRSP:$sp, uimm8_2:$addr)>; def ST16WSP : I16_ZSP_LDST<AddrMode16W, 0b111, "st16.w", - (outs), (ins mGPR:$rz, SPOp:$sp, uimm8_2:$addr)>; + (outs), (ins mGPR:$rz, GPRSP:$sp, uimm8_2:$addr)>; //===----------------------------------------------------------------------===// // Compare instructions. @@ -450,3 +440,150 @@ def JBF16 : JBranchPseudo<(outs), let mayLoad = 1, Size = 2, isCodeGenOnly = 0 in def PseudoLRW16 : CSKYPseudo<(outs mGPR:$rz), (ins bare_symbol:$src), "lrw16 $rz, $src", []>; + + +//===----------------------------------------------------------------------===// +// Compress Instruction tablegen backend. +//===----------------------------------------------------------------------===// + +def : CompressPat<(ADDU32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ADDU16XZ sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ADDU32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (ADDU16XZ sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(ADDU32 mGPR:$rd, mGPR:$rs1, mGPR:$rs2), + (ADDU16 mGPR:$rd, mGPR:$rs1, mGPR:$rs2)>; +def : CompressPat<(SUBU32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (SUBU16XZ sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(SUBU32 mGPR:$rd, mGPR:$rs1, mGPR:$rs2), + (SUBU16 mGPR:$rd, mGPR:$rs1, mGPR:$rs2)>; + +def : CompressPat< + (ADDC32 sGPR:$rd, CARRY:$cout, sGPR:$rd, sGPR:$rs2, CARRY:$cout), + (ADDC16 sGPR:$rd, CARRY:$cout, sGPR:$rs2, CARRY:$cout) + >; +def : CompressPat< + (SUBC32 sGPR:$rd, CARRY:$cout, sGPR:$rd, sGPR:$rs2, CARRY:$cout), + (SUBC16 sGPR:$rd, CARRY:$cout, sGPR:$rs2, CARRY:$cout) + >; + +def : CompressPat<(ADDI32 mGPR:$rd, mGPR:$rs, oimm3:$imm), + (ADDI16XZ mGPR:$rd, mGPR:$rs, oimm3:$imm)>; +def : CompressPat<(SUBI32 mGPR:$rd, mGPR:$rs, oimm3:$imm), + (SUBI16XZ mGPR:$rd, mGPR:$rs, oimm3:$imm)>; + +def : CompressPat<(ADDI32 mGPR:$rd, mGPR:$rd, oimm8:$imm), + (ADDI16 mGPR:$rd, oimm8:$imm)>; +def : 
CompressPat<(SUBI32 mGPR:$rd, mGPR:$rd, oimm8:$imm), + (SUBI16 mGPR:$rd, oimm8:$imm)>; + +def : CompressPat<(ADDI32 GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm), + (ADDI16SPSP GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm)>; +def : CompressPat<(SUBI32 GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm), + (SUBI16SPSP GPRSP:$sp, GPRSP:$sp, uimm7_2:$imm)>; + +def : CompressPat<(ADDI32 mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (ADDI16ZSP mGPR:$rd, GPRSP:$sp, uimm8_2:$imm)>; + +def : CompressPat<(MULT32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (MULT16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(MULT32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (MULT16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(AND32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (AND16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(AND32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (AND16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(OR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (OR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(OR32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (OR16 sGPR:$rd, sGPR:$rs1)>; +def : CompressPat<(XOR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (XOR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(XOR32 sGPR:$rd, sGPR:$rs1, sGPR:$rd), + (XOR16 sGPR:$rd, sGPR:$rs1)>; + +def : CompressPat<(ANDN32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ANDN16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(NOR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (NOR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(LSL32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (LSL16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(LSR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (LSR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ASR32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ASR16 sGPR:$rd, sGPR:$rs2)>; +def : CompressPat<(ROTL32 sGPR:$rd, sGPR:$rd, sGPR:$rs2), + (ROTL16 sGPR:$rd, sGPR:$rs2)>; + +def : CompressPat<(NOT32 sGPR:$rd, sGPR:$rd), + (NOT16 sGPR:$rd)>; + +let Predicates = [iHas2E3] in +def : CompressPat<(REVB32 sGPR:$rd, sGPR:$rs), + (REVB16 sGPR:$rd, sGPR:$rs)>; + +def : CompressPat<(LSLI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LSLI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : 
CompressPat<(LSRI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LSRI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(ASRI32 mGPR:$rd, mGPR:$rs, uimm5:$imm), + (ASRI16 mGPR:$rd, mGPR:$rs, uimm5:$imm)>; + +def : CompressPat<(CMPHS32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPHS16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; +def : CompressPat<(CMPLT32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPLT16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; +def : CompressPat<(CMPNE32 CARRY:$ca, sGPR:$rs1, sGPR:$rs2), + (CMPNE16 CARRY:$ca, sGPR:$rs1, sGPR:$rs2)>; + +def : CompressPat<(CMPHSI32 CARRY:$ca, mGPR:$rs, oimm5:$imm), + (CMPHSI16 CARRY:$ca, mGPR:$rs, oimm5:$imm)>; +def : CompressPat<(CMPLTI32 CARRY:$ca, mGPR:$rs, oimm5:$imm), + (CMPLTI16 CARRY:$ca, mGPR:$rs, oimm5:$imm)>; +def : CompressPat<(CMPNEI32 CARRY:$ca, mGPR:$rs, uimm5:$imm), + (CMPNEI16 CARRY:$ca, mGPR:$rs, uimm5:$imm)>; + +def : CompressPat<(JSR32 sGPR:$rd), + (JSR16 sGPR:$rd)>; + + +def : CompressPat<(MVCV32 sGPR:$rd, CARRY:$ca), + (MVCV16 sGPR:$rd, CARRY:$ca)>; +def : CompressPat<(MOV32 sGPR:$rd, sGPR:$ca), + (MOV16 sGPR:$rd, sGPR:$ca)>; +def : CompressPat<(MOVI32 mGPR:$rd, uimm8:$imm), + (MOVI16 mGPR:$rd, uimm8:$imm)>; + +def : CompressPat<(LD32B mGPR:$rd, mGPR:$rs, uimm5:$imm), + (LD16B mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(LD32H mGPR:$rd, mGPR:$rs, uimm5_1:$imm), + (LD16H mGPR:$rd, mGPR:$rs, uimm5_1:$imm)>; +def : CompressPat<(LD32W mGPR:$rd, mGPR:$rs, uimm5_2:$imm), + (LD16W mGPR:$rd, mGPR:$rs, uimm5_2:$imm)>; +def : CompressPat<(LD32W mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (LD16WSP mGPR:$rd, GPRSP:$sp, uimm8_2:$imm)>; + +def : CompressPat<(ST32B mGPR:$rd, mGPR:$rs, uimm5:$imm), + (ST16B mGPR:$rd, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(ST32H mGPR:$rd, mGPR:$rs, uimm5_1:$imm), + (ST16H mGPR:$rd, mGPR:$rs, uimm5_1:$imm)>; +def : CompressPat<(ST32W mGPR:$rd, mGPR:$rs, uimm5_2:$imm), + (ST16W mGPR:$rd, mGPR:$rs, uimm5_2:$imm)>; +def : CompressPat<(ST32W mGPR:$rd, GPRSP:$sp, uimm8_2:$imm), + (ST16WSP mGPR:$rd, 
GPRSP:$sp, uimm8_2:$imm)>; + +let Predicates = [HasBTST16] in +def : CompressPat<(BTSTI32 CARRY:$ca, mGPR:$rs, uimm5:$imm), + (BTSTI16 CARRY:$ca, mGPR:$rs, uimm5:$imm)>; +def : CompressPat<(BCLRI32 mGPR:$rd, mGPR:$rd, uimm5:$imm), + (BCLRI16 mGPR:$rd, uimm5:$imm)>; +def : CompressPat<(BSETI32 mGPR:$rd, mGPR:$rd, uimm5:$imm), + (BSETI16 mGPR:$rd, uimm5:$imm)>; + +def : CompressPat<(ZEXTB32 sGPR:$rd, sGPR:$rs), + (ZEXTB16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(ZEXTH32 sGPR:$rd, sGPR:$rs), + (ZEXTH16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(SEXTB32 sGPR:$rd, sGPR:$rs), + (SEXTB16 sGPR:$rd, sGPR:$rs)>; +def : CompressPat<(SEXTH32 sGPR:$rd, sGPR:$rs), + (SEXTH16 sGPR:$rd, sGPR:$rs)>; diff --git a/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp index c42a56bfb04e..7e0b9bcd7549 100644 --- a/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp +++ b/llvm/lib/Target/CSKY/CSKYMCInstLower.cpp @@ -114,4 +114,4 @@ bool CSKYMCInstLower::lowerOperand(const MachineOperand &MO, break; } return true; -}
\ No newline at end of file +} diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp index a1d45fea534b..57b6ae3c27b5 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.cpp @@ -88,8 +88,187 @@ CSKYRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_I32_SaveList; } +static bool IsLegalOffset(const CSKYInstrInfo *TII, MachineInstr *MI, + int &Offset) { + const MCInstrDesc &Desc = MI->getDesc(); + unsigned AddrMode = (Desc.TSFlags & CSKYII::AddrModeMask); + unsigned i = 0; + for (; !MI->getOperand(i).isFI(); ++i) { + assert(i + 1 < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + } + + if (MI->getOpcode() == CSKY::ADDI32) { + if (!isUInt<12>(std::abs(Offset) - 1)) + return false; + if (Offset < 0) { + MI->setDesc(TII->get(CSKY::SUBI32)); + Offset = -Offset; + } + + return true; + } + + if (MI->getOpcode() == CSKY::ADDI16XZ) + return false; + + if (Offset < 0) + return false; + + unsigned NumBits = 0; + unsigned Scale = 1; + switch (AddrMode) { + case CSKYII::AddrMode32B: + Scale = 1; + NumBits = 12; + break; + case CSKYII::AddrMode32H: + Scale = 2; + NumBits = 12; + break; + case CSKYII::AddrMode32WD: + Scale = 4; + NumBits = 12; + break; + case CSKYII::AddrMode16B: + Scale = 1; + NumBits = 5; + break; + case CSKYII::AddrMode16H: + Scale = 2; + NumBits = 5; + break; + case CSKYII::AddrMode16W: + Scale = 4; + NumBits = 5; + break; + case CSKYII::AddrMode32SDF: + Scale = 4; + NumBits = 8; + break; + default: + llvm_unreachable("Unsupported addressing mode!"); + } + + // Cannot encode offset. + if ((Offset & (Scale - 1)) != 0) + return false; + + unsigned Mask = (1 << NumBits) - 1; + if ((unsigned)Offset <= Mask * Scale) + return true; + + // Offset out of range. 
+ return false; +} + void CSKYRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { assert(SPAdj == 0 && "Unexpected non-zero SPAdj value"); -}
\ No newline at end of file + + MachineInstr *MI = &*II; + MachineBasicBlock &MBB = *MI->getParent(); + MachineFunction &MF = *MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const CSKYInstrInfo *TII = MF.getSubtarget<CSKYSubtarget>().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + const CSKYSubtarget &STI = MF.getSubtarget<CSKYSubtarget>(); + + switch (MI->getOpcode()) { + default: + break; + case CSKY::RESTORE_CARRY: { + Register NewReg = STI.hasE2() + ? MRI.createVirtualRegister(&CSKY::GPRRegClass) + : MRI.createVirtualRegister(&CSKY::mGPRRegClass); + + auto *Temp = BuildMI(MBB, II, DL, TII->get(CSKY::LD32W), NewReg) + .add(MI->getOperand(1)) + .add(MI->getOperand(2)) + .getInstr(); + + BuildMI(MBB, II, DL, TII->get(STI.hasE2() ? CSKY::BTSTI32 : CSKY::BTSTI16), + MI->getOperand(0).getReg()) + .addReg(NewReg, getKillRegState(true)) + .addImm(0); + + MI = Temp; + + MBB.erase(II); + break; + } + case CSKY::SPILL_CARRY: { + Register NewReg; + if (STI.hasE2()) { + NewReg = MRI.createVirtualRegister(&CSKY::GPRRegClass); + BuildMI(MBB, II, DL, TII->get(CSKY::MVC32), NewReg) + .add(MI->getOperand(0)); + } else { + NewReg = MRI.createVirtualRegister(&CSKY::mGPRRegClass); + BuildMI(MBB, II, DL, TII->get(CSKY::MOVI16), NewReg).addImm(0); + BuildMI(MBB, II, DL, TII->get(CSKY::ADDC16)) + .addReg(NewReg, RegState::Define) + .addReg(MI->getOperand(0).getReg(), RegState::Define) + .addReg(NewReg, getKillRegState(true)) + .addReg(NewReg, getKillRegState(true)) + .addReg(MI->getOperand(0).getReg()); + + BuildMI(MBB, II, DL, TII->get(CSKY::BTSTI16), MI->getOperand(0).getReg()) + .addReg(NewReg) + .addImm(0); + } + + MI = BuildMI(MBB, II, DL, TII->get(CSKY::ST32W)) + .addReg(NewReg, getKillRegState(true)) + .add(MI->getOperand(1)) + .add(MI->getOperand(2)) + .getInstr(); + + MBB.erase(II); + + break; + } + } + + int FrameIndex = MI->getOperand(FIOperandNum).getIndex(); + Register FrameReg; + int Offset = getFrameLowering(MF) + 
->getFrameIndexReference(MF, FrameIndex, FrameReg) + .getFixed() + + MI->getOperand(FIOperandNum + 1).getImm(); + + if (!isInt<32>(Offset)) + report_fatal_error( + "Frame offsets outside of the signed 32-bit range not supported"); + + bool FrameRegIsKill = false; + MachineBasicBlock::iterator NewII(MI); + if (!IsLegalOffset(TII, MI, Offset)) { + assert(isInt<32>(Offset) && "Int32 expected"); + // The offset won't fit in an immediate, so use a scratch register instead + // Modify Offset and FrameReg appropriately + assert(Offset >= 0); + Register ScratchReg = TII->movImm(MBB, NewII, DL, Offset); + BuildMI(MBB, NewII, DL, + TII->get(STI.hasE2() ? CSKY::ADDU32 : CSKY::ADDU16XZ), ScratchReg) + .addReg(ScratchReg, RegState::Kill) + .addReg(FrameReg); + + Offset = 0; + FrameReg = ScratchReg; + FrameRegIsKill = true; + } + + if (Offset == 0 && + (MI->getOpcode() == CSKY::ADDI32 || MI->getOpcode() == CSKY::ADDI16XZ)) { + MI->setDesc(TII->get(TargetOpcode::COPY)); + MI->getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); + MI->RemoveOperand(FIOperandNum + 1); + } else { + MI->getOperand(FIOperandNum) + .ChangeToRegister(FrameReg, false, false, FrameRegIsKill); + MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + } +} diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.h b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h index 779ea6493c7e..5b3b62ec0db2 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.h +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.h @@ -38,6 +38,18 @@ public: void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override; + + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override { + return true; + } + + bool requiresRegisterScavenging(const MachineFunction &MF) const override { + return true; + } + + bool useFPForScavengingIndex(const MachineFunction &MF) const override { + return false; + } }; } // namespace llvm diff --git 
a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td index 7548c22bb2c5..ade5c7f795af 100644 --- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td +++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td @@ -168,6 +168,11 @@ def mGPR : RegisterClass<"CSKY", [i32], 32, let Size = 32; } +// Register class for SP only. +def GPRSP : RegisterClass<"CSKY", [i32], 32, (add R14)> { + let Size = 32; +} + def GPRPair : RegisterClass<"CSKY", [untyped], 32, (add GPRTuple)> { let Size = 64; } diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td index 7518fd774a48..ae811b30434d 100644 --- a/llvm/lib/Target/Hexagon/Hexagon.td +++ b/llvm/lib/Target/Hexagon/Hexagon.td @@ -29,6 +29,8 @@ def ProcTinyCore: SubtargetFeature<"tinycore", "HexagonProcFamily", // Hexagon ISA Extensions def ExtensionZReg: SubtargetFeature<"zreg", "UseZRegOps", "true", "Hexagon ZReg extension instructions">; +def ExtensionHVXQFloat: SubtargetFeature<"hvx-qfloat", "UseHVXQFloatOps", + "true", "Hexagon HVX QFloating point instructions">; def ExtensionHVX: SubtargetFeature<"hvx", "HexagonHVXVersion", "Hexagon::ArchEnum::V60", "Hexagon HVX instructions">; @@ -52,6 +54,10 @@ def ExtensionHVXV68: SubtargetFeature<"hvxv68", "HexagonHVXVersion", "Hexagon::ArchEnum::V68", "Hexagon HVX instructions", [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66, ExtensionHVXV67]>; +def ExtensionHVXV69: SubtargetFeature<"hvxv69", "HexagonHVXVersion", + "Hexagon::ArchEnum::V69", "Hexagon HVX instructions", + [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66, + ExtensionHVXV67, ExtensionHVXV68]>; def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps", "true", "Hexagon HVX 64B instructions", [ExtensionHVX]>; @@ -61,6 +67,9 @@ def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps", def ExtensionAudio: SubtargetFeature<"audio", "UseAudioOps", "true", "Hexagon Audio extension instructions">; +def ExtensionHVXIEEEFP: 
SubtargetFeature<"hvx-ieee-fp", "UseHVXIEEEFPOps", + "true", "Hexagon HVX IEEE floating point instructions">; + def FeatureCompound: SubtargetFeature<"compound", "UseCompound", "true", "Use compound instructions">; def FeaturePackets: SubtargetFeature<"packets", "UsePackets", "true", @@ -88,6 +97,8 @@ def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19", def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim", "NoreturnStackElim", "true", "Eliminate stack allocation in a noreturn function when possible">; +def FeatureCabac: SubtargetFeature<"cabac", "UseCabac", "false", + "Emit the CABAC instruction">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. @@ -112,6 +123,8 @@ def UseHVXV67 : Predicate<"HST->useHVXV67Ops()">, AssemblerPredicate<(all_of ExtensionHVXV67)>; def UseHVXV68 : Predicate<"HST->useHVXV68Ops()">, AssemblerPredicate<(all_of ExtensionHVXV68)>; +def UseHVXV69 : Predicate<"HST->useHVXV69Ops()">, + AssemblerPredicate<(all_of ExtensionHVXV69)>; def UseAudio : Predicate<"HST->useAudioOps()">, AssemblerPredicate<(all_of ExtensionAudio)>; def UseZReg : Predicate<"HST->useZRegOps()">, @@ -119,6 +132,11 @@ def UseZReg : Predicate<"HST->useZRegOps()">, def UseCompound : Predicate<"HST->useCompound()">; def HasPreV65 : Predicate<"HST->hasPreV65()">, AssemblerPredicate<(all_of FeaturePreV65)>; +def UseHVXIEEEFP : Predicate<"HST->useHVXIEEEFPOps()">, + AssemblerPredicate<(all_of ExtensionHVXIEEEFP)>; +def UseHVXQFloat : Predicate<"HST->useHVXQFloatOps()">, + AssemblerPredicate<(all_of ExtensionHVXQFloat)>; +def UseHVXFloatingPoint: Predicate<"HST->useHVXFloatingPoint()">; def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">, AssemblerPredicate<(all_of FeatureMemNoShuf)>; def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">; @@ -127,6 +145,8 @@ def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||" let RecomputePerFunction = 1; } def UseSmallData : 
Predicate<"HST->useSmallData()">; +def UseCabac : Predicate<"HST->useCabac()">, + AssemblerPredicate<(any_of FeatureCabac)>; def Hvx64: HwMode<"+hvx-length64b">; def Hvx128: HwMode<"+hvx-length128b">; @@ -299,7 +319,7 @@ def changeAddrMode_rr_ur: InstrMapping { let ValueCols = [["BaseLongOffset"]]; } -def changeAddrMode_ur_rr : InstrMapping { +def changeAddrMode_ur_rr: InstrMapping { let FilterClass = "ImmRegShl"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; let ColFields = ["addrMode"]; @@ -370,40 +390,55 @@ class Proc<string Name, SchedMachineModel Model, def : Proc<"generic", HexagonModelV60, [ArchV5, ArchV55, ArchV60, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv5", HexagonModelV5, [ArchV5, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv55", HexagonModelV55, [ArchV5, ArchV55, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv60", HexagonModelV60, [ArchV5, ArchV55, ArchV60, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv62", HexagonModelV62, [ArchV5, ArchV55, ArchV60, ArchV62, FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv65", HexagonModelV65, [ArchV5, ArchV55, ArchV60, ArchV62, 
ArchV65, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv66", HexagonModelV66, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv67", HexagonModelV67, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; def : Proc<"hexagonv68", HexagonModelV68, [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, ArchV68, FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, - FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>; + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; +def : Proc<"hexagonv69", HexagonModelV69, + [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67, + ArchV68, ArchV69, + FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops, + FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData, + FeatureCabac]>; // Need to update the correct features for tiny core. // Disable NewValueJumps since the packetizer is unable to handle a packet with // a new value jump and another SLOT0 instruction. 
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 8e6a01e3a186..411078052e0f 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -773,6 +773,67 @@ void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) { OutStreamer->emitInstruction(MCB, getSubtargetInfo()); } +void HexagonAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) { + static const int8_t NoopsInSledCount = 4; + // We want to emit the following pattern: + // + // .L_xray_sled_N: + // <xray_sled_base>: + // { jump .Ltmp0 } + // { nop + // nop + // nop + // nop } + // .Ltmp0: + // + // We need the 4 nop words because at runtime, we'd be patching over the + // full 5 words with the following pattern: + // + // <xray_sled_n>: + // { immext(#...) // upper 26-bits of trampoline + // r6 = ##... // lower 6-bits of trampoline + // immext(#...) // upper 26-bits of func id + // r7 = ##... } // lower 6 bits of func id + // { callr r6 } + // + // + auto CurSled = OutContext.createTempSymbol("xray_sled_", true); + OutStreamer->emitLabel(CurSled); + + MCInst *SledJump = new (OutContext) MCInst(); + SledJump->setOpcode(Hexagon::J2_jump); + auto PostSled = OutContext.createTempSymbol(); + SledJump->addOperand(MCOperand::createExpr(HexagonMCExpr::create( + MCSymbolRefExpr::create(PostSled, OutContext), OutContext))); + + // Emit "jump PostSled" instruction, which jumps over the nop series. + MCInst SledJumpPacket; + SledJumpPacket.setOpcode(Hexagon::BUNDLE); + SledJumpPacket.addOperand(MCOperand::createImm(0)); + SledJumpPacket.addOperand(MCOperand::createInst(SledJump)); + + EmitToStreamer(*OutStreamer, SledJumpPacket); + + // FIXME: this will emit individual packets, we should + // special-case this and combine them into a single packet. 
+ emitNops(NoopsInSledCount); + + OutStreamer->emitLabel(PostSled); + recordSled(CurSled, MI, Kind, 0); +} + +void HexagonAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { + EmitSled(MI, SledKind::FUNCTION_ENTER); +} + +void HexagonAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) { + EmitSled(MI, SledKind::FUNCTION_EXIT); +} + +void HexagonAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) { + EmitSled(MI, SledKind::TAIL_CALL); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() { RegisterAsmPrinter<HexagonAsmPrinter> X(getTheHexagonTarget()); } diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 3932def87854..93d5f1dce7af 100644 --- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -36,7 +36,11 @@ class TargetMachine; bool runOnMachineFunction(MachineFunction &Fn) override { Subtarget = &Fn.getSubtarget<HexagonSubtarget>(); - return AsmPrinter::runOnMachineFunction(Fn); + const bool Modified = AsmPrinter::runOnMachineFunction(Fn); + // Emit the XRay table for this function. + emitXRayTable(); + + return Modified; } StringRef getPassName() const override { @@ -47,6 +51,16 @@ class TargetMachine; const override; void emitInstruction(const MachineInstr *MI) override; + + //===------------------------------------------------------------------===// + // XRay implementation + //===------------------------------------------------------------------===// + // XRay-specific lowering for Hexagon. 
+ void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI); + void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI); + void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI); + void EmitSled(const MachineInstr &MI, SledKind Kind); + void HexagonProcessInstruction(MCInst &Inst, const MachineInstr &MBB); void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 2c5ad3b589d2..428d25da6dbc 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -995,8 +995,8 @@ bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { MachineBasicBlock *B = N->getBlock(); std::vector<MachineInstr*> Instrs; - for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) - Instrs.push_back(&*I); + for (MachineInstr &MI : llvm::reverse(*B)) + Instrs.push_back(&MI); for (auto MI : Instrs) { unsigned Opc = MI->getOpcode(); @@ -3084,8 +3084,7 @@ void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, .addMBB(&LB); RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); - for (unsigned i = G.Ins.size(); i > 0; --i) { - const MachineInstr *SI = G.Ins[i-1]; + for (const MachineInstr *SI : llvm::reverse(G.Ins)) { unsigned DR = getDefReg(SI); const TargetRegisterClass *RC = MRI->getRegClass(DR); Register NewDR = MRI->createVirtualRegister(RC); @@ -3156,20 +3155,20 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { // if that instruction could potentially be moved to the front of the loop: // the output of the loop cannot be used in a non-shuffling instruction // in this loop. 
- for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) { - if (I->isTerminator()) + for (MachineInstr &MI : llvm::reverse(*C.LB)) { + if (MI.isTerminator()) continue; - if (I->isPHI()) + if (MI.isPHI()) break; RegisterSet Defs; - HBS::getInstrDefs(*I, Defs); + HBS::getInstrDefs(MI, Defs); if (Defs.count() != 1) continue; Register DefR = Defs.find_first(); if (!DefR.isVirtual()) continue; - if (!isBitShuffle(&*I, DefR)) + if (!isBitShuffle(&MI, DefR)) continue; bool BadUse = false; @@ -3183,8 +3182,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (UseI->getOperand(Idx+1).getMBB() != C.LB) BadUse = true; } else { - auto F = find(ShufIns, UseI); - if (F == ShufIns.end()) + if (!llvm::is_contained(ShufIns, UseI)) BadUse = true; } } else { @@ -3199,7 +3197,7 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) { if (BadUse) continue; - ShufIns.push_back(&*I); + ShufIns.push_back(&MI); } // Partition the list of shuffling instructions into instruction groups, diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 8c3b9572201e..a53efeb96961 100644 --- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -1256,15 +1256,11 @@ void HexagonCommonGEP::removeDeadCode() { BO.push_back(DTN->getBlock()); } - for (unsigned i = BO.size(); i > 0; --i) { - BasicBlock *B = cast<BasicBlock>(BO[i-1]); - BasicBlock::InstListType &IL = B->getInstList(); - - using reverse_iterator = BasicBlock::InstListType::reverse_iterator; - + for (Value *V : llvm::reverse(BO)) { + BasicBlock *B = cast<BasicBlock>(V); ValueVect Ins; - for (reverse_iterator I = IL.rbegin(), E = IL.rend(); I != E; ++I) - Ins.push_back(&*I); + for (Instruction &I : llvm::reverse(*B)) + Ins.push_back(&I); for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) { Instruction *In = cast<Instruction>(*I); if (isInstructionTriviallyDead(In)) diff --git 
a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h index 7a43a4440b2d..56174dc7e136 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepArch.h +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h @@ -21,31 +21,32 @@ namespace llvm { namespace Hexagon { -enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68 }; +enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67, V68, V69 }; -static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68}; +static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67, 68, 69}; static constexpr ArrayRef<unsigned> ArchValsNum(ArchValsNumArray); -static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68" }; +static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v68", "v69" }; static constexpr ArrayRef<StringLiteral> ArchValsText(ArchValsTextArray); -static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68" }; +static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t", "hexagonv68", "hexagonv69" }; static constexpr ArrayRef<StringLiteral> CpuValsText(CpuValsTextArray); -static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68" }; +static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t", "v68", "v69" }; static constexpr ArrayRef<StringLiteral> CpuNickText(CpuNickTextArray); static const std::map<std::string, ArchEnum> CpuTable{ - {"generic", Hexagon::ArchEnum::V5}, - {"hexagonv5", Hexagon::ArchEnum::V5}, - {"hexagonv55", Hexagon::ArchEnum::V55}, - {"hexagonv60", Hexagon::ArchEnum::V60}, - {"hexagonv62", 
Hexagon::ArchEnum::V62}, - {"hexagonv65", Hexagon::ArchEnum::V65}, - {"hexagonv66", Hexagon::ArchEnum::V66}, - {"hexagonv67", Hexagon::ArchEnum::V67}, - {"hexagonv67t", Hexagon::ArchEnum::V67}, - {"hexagonv68", Hexagon::ArchEnum::V68}, + {"generic", Hexagon::ArchEnum::V5}, + {"hexagonv5", Hexagon::ArchEnum::V5}, + {"hexagonv55", Hexagon::ArchEnum::V55}, + {"hexagonv60", Hexagon::ArchEnum::V60}, + {"hexagonv62", Hexagon::ArchEnum::V62}, + {"hexagonv65", Hexagon::ArchEnum::V65}, + {"hexagonv66", Hexagon::ArchEnum::V66}, + {"hexagonv67", Hexagon::ArchEnum::V67}, + {"hexagonv67t", Hexagon::ArchEnum::V67}, + {"hexagonv68", Hexagon::ArchEnum::V68}, + {"hexagonv69", Hexagon::ArchEnum::V69}, }; static const std::map<std::string, unsigned> ElfFlagsByCpuStr = { @@ -59,6 +60,7 @@ static const std::map<std::string, unsigned> ElfFlagsByCpuStr = { {"hexagonv67", llvm::ELF::EF_HEXAGON_MACH_V67}, {"hexagonv67t", llvm::ELF::EF_HEXAGON_MACH_V67T}, {"hexagonv68", llvm::ELF::EF_HEXAGON_MACH_V68}, + {"hexagonv69", llvm::ELF::EF_HEXAGON_MACH_V69}, }; static const std::map<unsigned, std::string> ElfArchByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V5, "V5"}, @@ -70,6 +72,7 @@ static const std::map<unsigned, std::string> ElfArchByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V67, "V67"}, {llvm::ELF::EF_HEXAGON_MACH_V67T, "V67T"}, {llvm::ELF::EF_HEXAGON_MACH_V68, "V68"}, + {llvm::ELF::EF_HEXAGON_MACH_V69, "V69"}, }; static const std::map<unsigned, std::string> ElfCpuByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V5, "hexagonv5"}, @@ -81,6 +84,7 @@ static const std::map<unsigned, std::string> ElfCpuByMachFlags = { {llvm::ELF::EF_HEXAGON_MACH_V67, "hexagonv67"}, {llvm::ELF::EF_HEXAGON_MACH_V67T, "hexagonv67t"}, {llvm::ELF::EF_HEXAGON_MACH_V68, "hexagonv68"}, + {llvm::ELF::EF_HEXAGON_MACH_V69, "hexagonv69"}, }; } // namespace Hexagon diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td index e743a291f1e5..e4f24e3c2e66 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonDepArch.td +++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td @@ -24,3 +24,5 @@ def ArchV67: SubtargetFeature<"v67", "HexagonArchVersion", "Hexagon::ArchEnum::V def HasV67 : Predicate<"HST->hasV67Ops()">, AssemblerPredicate<(all_of ArchV67)>; def ArchV68: SubtargetFeature<"v68", "HexagonArchVersion", "Hexagon::ArchEnum::V68", "Enable Hexagon V68 architecture">; def HasV68 : Predicate<"HST->hasV68Ops()">, AssemblerPredicate<(all_of ArchV68)>; +def ArchV69: SubtargetFeature<"v69", "HexagonArchVersion", "Hexagon::ArchEnum::V69", "Enable Hexagon V69 architecture">; +def HasV69 : Predicate<"HST->hasV69Ops()">, AssemblerPredicate<(all_of ArchV69)>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc index 40f6e14aed13..7164af3ad5c6 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc +++ b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc @@ -8,6 +8,7 @@ // Automatically generated file, do not edit! //===----------------------------------------------------------------------===// + #if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-function" diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td index a1db3ae7239d..d195df918293 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td @@ -11,6 +11,7 @@ def tc_04da405a : InstrItinClass; def tc_05ca8cfd : InstrItinClass; def tc_08a4f1b6 : InstrItinClass; +def tc_0afc8be9 : InstrItinClass; def tc_0b04c6c7 : InstrItinClass; def tc_0ec46cf9 : InstrItinClass; def tc_131f1c81 : InstrItinClass; @@ -21,6 +22,7 @@ def tc_191381c1 : InstrItinClass; def tc_1ad8a370 : InstrItinClass; def tc_1ba8a0cd : InstrItinClass; def tc_20a4bbec : InstrItinClass; +def tc_2120355e : InstrItinClass; def tc_257f6f7c : InstrItinClass; def tc_26a377fe : InstrItinClass; def tc_2b4c548e : InstrItinClass; @@ -28,15 +30,18 @@ 
def tc_2c745bb8 : InstrItinClass; def tc_2d4051cd : InstrItinClass; def tc_2e8f5f6e : InstrItinClass; def tc_309dbb4f : InstrItinClass; +def tc_37820f4c : InstrItinClass; def tc_3904b926 : InstrItinClass; def tc_3aacf4a8 : InstrItinClass; def tc_3ad719fb : InstrItinClass; def tc_3c56e5ce : InstrItinClass; +def tc_3c8c15d0 : InstrItinClass; def tc_3ce09744 : InstrItinClass; def tc_3e2aaafc : InstrItinClass; def tc_447d9895 : InstrItinClass; def tc_453fe68d : InstrItinClass; def tc_46d6c3e0 : InstrItinClass; +def tc_4942646a : InstrItinClass; def tc_51d0ecc3 : InstrItinClass; def tc_52447ecc : InstrItinClass; def tc_540c3da3 : InstrItinClass; @@ -46,6 +51,7 @@ def tc_56c4f9fe : InstrItinClass; def tc_56e64202 : InstrItinClass; def tc_58d21193 : InstrItinClass; def tc_5bf8afbb : InstrItinClass; +def tc_5cdf8c84 : InstrItinClass; def tc_61bf7c03 : InstrItinClass; def tc_649072c2 : InstrItinClass; def tc_660769f1 : InstrItinClass; @@ -57,6 +63,8 @@ def tc_71646d06 : InstrItinClass; def tc_7177e272 : InstrItinClass; def tc_718b5c53 : InstrItinClass; def tc_7273323b : InstrItinClass; +def tc_72e2b393 : InstrItinClass; +def tc_73efe966 : InstrItinClass; def tc_7417e785 : InstrItinClass; def tc_767c4e9d : InstrItinClass; def tc_7d68d5c2 : InstrItinClass; @@ -71,9 +79,11 @@ def tc_9d1dc972 : InstrItinClass; def tc_9f363d21 : InstrItinClass; def tc_a02a10a8 : InstrItinClass; def tc_a0dbea28 : InstrItinClass; +def tc_a19b9305 : InstrItinClass; def tc_a28f32b5 : InstrItinClass; def tc_a69eeee1 : InstrItinClass; def tc_a7e6707d : InstrItinClass; +def tc_aa047364 : InstrItinClass; def tc_ab23f776 : InstrItinClass; def tc_abe8c3b2 : InstrItinClass; def tc_ac4046bc : InstrItinClass; @@ -89,8 +99,10 @@ def tc_c4edf264 : InstrItinClass; def tc_c5dba46e : InstrItinClass; def tc_c7039829 : InstrItinClass; def tc_cd94bfe0 : InstrItinClass; +def tc_cda936da : InstrItinClass; def tc_d8287c14 : InstrItinClass; def tc_db5555f3 : InstrItinClass; +def tc_dcca380f : InstrItinClass; def 
tc_dd5b0695 : InstrItinClass; def tc_df80eeb0 : InstrItinClass; def tc_e2d2e9e5 : InstrItinClass; @@ -99,6 +111,7 @@ def tc_e3f68a46 : InstrItinClass; def tc_e675c45a : InstrItinClass; def tc_e699ae41 : InstrItinClass; def tc_e99d4c2e : InstrItinClass; +def tc_f175e046 : InstrItinClass; def tc_f1de44ef : InstrItinClass; def tc_f21e8abb : InstrItinClass; @@ -119,6 +132,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -174,6 +192,10 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -209,6 +231,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -231,6 +258,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -259,6 +291,11 @@ class DepHVXItinV55 { 
InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -306,6 +343,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -363,6 +405,16 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -437,6 +489,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -456,6 +513,10 @@ class DepHVXItinV55 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, 
HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -537,6 +598,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -547,6 +613,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -589,6 +660,11 @@ class DepHVXItinV55 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -620,6 +696,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -675,6 +756,10 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + 
InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -710,6 +795,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -732,6 +822,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -760,6 +855,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -807,6 +907,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -864,6 +969,16 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 
0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -938,6 +1053,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -957,6 +1077,10 @@ class DepHVXItinV60 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -1038,6 +1162,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -1048,6 +1177,11 @@ class DepHVXItinV60 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -1090,6 +1224,11 @@ class DepHVXItinV60 { 
InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1121,6 +1260,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1176,6 +1320,10 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -1211,6 +1359,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -1233,6 +1386,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -1261,6 
+1419,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -1308,6 +1471,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -1365,6 +1533,16 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -1439,6 +1617,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -1458,6 +1641,10 @@ class DepHVXItinV62 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, 
SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -1539,6 +1726,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -1549,6 +1741,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -1591,6 +1788,11 @@ class DepHVXItinV62 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1622,6 +1824,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -1677,6 +1884,10 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, 
SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -1712,6 +1923,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -1734,6 +1950,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -1762,6 +1983,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -1809,6 +2035,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -1866,6 +2097,16 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData 
<tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -1940,6 +2181,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -1959,6 +2205,10 @@ class DepHVXItinV65 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -2040,6 +2290,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -2050,6 +2305,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, 
[CVI_ZW]>], [2, 1, 2], @@ -2092,6 +2352,11 @@ class DepHVXItinV65 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2123,6 +2388,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2178,6 +2448,10 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -2213,6 +2487,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -2235,6 +2514,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ 
[InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -2263,6 +2547,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -2310,6 +2599,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -2367,6 +2661,16 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -2441,6 +2745,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -2460,6 +2769,10 @@ class DepHVXItinV66 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + 
InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -2541,6 +2854,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -2551,6 +2869,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -2593,6 +2916,11 @@ class DepHVXItinV66 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2624,6 +2952,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -2679,6 +3012,10 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + 
InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -2714,6 +3051,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -2736,6 +3078,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -2764,6 +3111,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -2811,6 +3163,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -2868,6 +3225,16 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], 
[1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -2942,6 +3309,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT1,LOAD,VA*/ [InstrStage<1, [SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -2961,6 +3333,10 @@ class DepHVXItinV67 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -3042,6 +3418,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -3052,6 +3433,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, 
/*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -3094,6 +3480,11 @@ class DepHVXItinV67 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -3125,6 +3516,575 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [], + []>, + + InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/ + [InstrStage<1, 
[SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7], + [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2], + [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2b4c548e, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_309dbb4f, 
/*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_37820f4c, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7], + [HVX_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, 
CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_540c3da3, /*SLOT0,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1], + [Hex_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_56e64202, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, 
HVX_FWD]>, + + InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_649072c2, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7095ecba, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7177e272, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 
0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9], + [HVX_FWD]>, + + InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_72e2b393, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_7d68d5c2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_8772086c, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7], + [HVX_FWD, 
Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_946013d8, /*SLOT0123,VP*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7], + [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_a69eeee1, /*SLOT01,LOAD,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7], + [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData 
<tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac4046bc, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7], + [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b091f1c6, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb599486, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c0749f3c, 
/*SLOT01,LOAD,VA*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2], + [HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c127de3a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_c4edf264, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2], + [HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7], + [Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>, + InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_dcca380f, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [2, 1, 2], + [Hex_FWD, 
Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_ALL]>], [3], + [HVX_FWD]>, + + InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e699ae41, /*SLOT01,ZW*/ + [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_ZW]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/ + [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>], [1, 2, 5], + [Hex_FWD, Hex_FWD, HVX_FWD]> + ]; +} + +class DepHVXItinV69 { + list<InstrItinData> DepHVXItinV69_list = [ + InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/ + 
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + + InstrItinData <tc_0afc8be9, /*SLOT23,VX_DV*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], @@ -3180,6 +4140,10 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ST]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_2120355e, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7], @@ -3215,6 +4179,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2], [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_37820f4c, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3904b926, /*SLOT01,LOAD*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD]>], [9, 2, 1, 2], @@ -3237,6 +4206,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_3c8c15d0, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_3ce09744, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2], @@ -3265,6 +4239,11 @@ class DepHVXItinV68 { InstrStage<1, 
[CVI_XLANE]>], [9, 5, 5], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_4942646a, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5], @@ -3312,6 +4291,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 2], [HVX_FWD, Hex_FWD]>, + InstrItinData <tc_5cdf8c84, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7], + [HVX_FWD, HVX_FWD]>, + InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2], @@ -3369,6 +4353,16 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7], [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_72e2b393, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + + InstrItinData <tc_73efe966, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_7417e785, /*SLOT0123,VS*/ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>, InstrStage<1, [CVI_SHIFT]>], [9, 5, 2], @@ -3443,6 +4437,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ZW]>], [3, 1, 2], [Hex_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_a19b9305, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 5], + [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_a28f32b5, /*SLOT01,LOAD,VA*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_LD], 0>, @@ -3462,6 +4461,10 @@ class DepHVXItinV68 { InstrStage<1, [CVI_XLANE]>], [9, 1, 2], [HVX_FWD, Hex_FWD, Hex_FWD]>, + InstrItinData <tc_aa047364, /*SLOT0123*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [9, 7, 7], + 
[HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_ab23f776, /*SLOT0,STORE*/ [InstrStage<1, [SLOT0], 0>, InstrStage<1, [CVI_ST]>], [1, 2, 5], @@ -3543,6 +4546,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2], [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_cda936da, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7], + [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/ [InstrStage<1, [SLOT2, SLOT3], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 5], @@ -3553,6 +4561,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7], [HVX_FWD, HVX_FWD, HVX_FWD]>, + InstrItinData <tc_dcca380f, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2], + [HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/ [InstrStage<1, [SLOT0, SLOT1], 0>, InstrStage<1, [CVI_ZW]>], [2, 1, 2], @@ -3595,6 +4608,11 @@ class DepHVXItinV68 { InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5], [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>, + InstrItinData <tc_f175e046, /*SLOT23,VX*/ + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5, 2], + [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>, + InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/ [InstrStage<1, [SLOT2], 0>, InstrStage<1, [CVI_MPY01]>], [9, 5, 2], diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td index a3766652794b..a979bafe8e33 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td +++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td @@ -7338,3 +7338,771 @@ class DepScalarItinV68 { [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> ]; } + +class DepScalarItinV69 { + list<InstrItinData> DepScalarItinV69_list = [ + InstrItinData <tc_011e0e9d, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01d44cb2, /*tc_2*/ + [InstrStage<1, [SLOT2, 
SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_01e1be3b, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_02fe1c65, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0655b949, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_075c8dd8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a195f2c, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0a6c20ae, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0ba0d5da, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_0dfac0a7, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_0fac1eb8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_112d30d6, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1242dc2a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_1248597c, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_14ab4f41, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_151bf368, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_158aa3f7, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_197dce51, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1981450d, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + 
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1c7522a8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1d41f8b7, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fcb8495, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_1fe4ab69, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_20131976, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2237d952, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_23708a21, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_2471c1c8, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24e109c7, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_24f426ab, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_280f7fe1, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_28e55c6f, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c13e7f5, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_2c3e17fc, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_2f573607, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_362b0be2, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [1], + 
[Hex_FWD]>, + + InstrItinData <tc_38382228, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_388f9897, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_38e0bae9, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3d14a17b, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3edca78f, /*tc_2*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_3fbf1042, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_407e96f9, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_40d64c94, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4222e6bf, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_42ff66ba, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_442395f3, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_449acf79, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44d5a428, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_44fffc58, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_45791fb8, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_45f9d1be, /*tc_2early*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_49fdfd4b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, 
Hex_FWD]>, + + InstrItinData <tc_4a55d03c, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4abdbdc6, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4ac61d92, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_4bf903b0, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3], + [Hex_FWD]>, + + InstrItinData <tc_503ce0f3, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_53c851ab, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5502c366, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55255f2b, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [], + []>, + + InstrItinData <tc_556f6577, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55a9a350, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_55b33fda, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_56a124a7, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_57a55b54, /*tc_1*/ + [InstrStage<1, [SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5944960d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_59a7822c, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5a4b5e58, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5b347363, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5ceb2f9e, /*tc_ld*/ + 
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5da50c4b, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5deb5e47, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5e4cf0e8, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_60e324ff, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_63567288, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4], + [Hex_FWD]>, + + InstrItinData <tc_64b00d8a, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_651cbe02, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65279839, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_65cbd974, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_69bfb303, /*tc_3*/ + [InstrStage<1, [SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6ae3426b, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6d861a95, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6e20402a, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6f42bc60, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_6fc5dbea, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_711c805f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, 
SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_713b66bf, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7401744f, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7476d766, /*tc_3stall*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_74a42bda, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_76bb5435, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_77f94a5e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_788b1d09, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7af3a37e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 3], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7b9187d3, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c31e19a, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7c6d32e4, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f7f45f5, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_7f8ae742, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8035e91f, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_822c3c68, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_829d8a86, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + 
InstrItinData <tc_838c4d7a, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_84a7500d, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_86173609, /*tc_2latepred*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_887d1bb7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a6d0d94, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8a825db2, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8b5bd4f5, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_8e82e8ca, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9124c04f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_92240447, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_934753bb, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_937dd41c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [], + []>, + + InstrItinData <tc_9406230a, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_95a33176, /*tc_2*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_96ef76ef, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_975a4e54, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 3, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9783714b, /*tc_4x*/ + 
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9b34f5e0, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [], + []>, + + InstrItinData <tc_9b3c0462, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9bcfb2ee, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9c52f549, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e27f2f9, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9e72dc89, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edb7c77, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9edefe01, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_9f6cd987, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a08b630b, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a1297125, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a154b476, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a2b365d2, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a3070909, /*tc_3stall*/ + [InstrStage<1, [SLOT0]>], [1, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a32e03e7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a38c45dc, /*tc_3x*/ + [InstrStage<1, [SLOT2, 
SLOT3]>], [4, 2, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4e22bbd, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a4ee89db, /*tc_2early*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_a7a13fac, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a7bdb22c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_a9edeffa, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_abfd9a6d, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ac65613f, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_addc37a8, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ae5babd7, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_aee6250c, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b1ae5f67, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_b4dc7630, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b7c4062a, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_b837298f, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [], + []>, + + InstrItinData <tc_ba9255a6, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bb07f2c5, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + 
+ InstrItinData <tc_bb831a7c, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_bf2ffc0f, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c20701f0, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c21d7447, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c57d9f39, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_c818ff7f, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [], + []>, + + InstrItinData <tc_ce59038e, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_cfa0e29b, /*tc_st*/ + [InstrStage<1, [SLOT0]>], [2, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d03278fd, /*tc_st*/ + [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d33e5eee, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d3632d88, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d45ba9cd, /*tc_ld*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_d57d649c, /*tc_3stall*/ + [InstrStage<1, [SLOT2]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_d61dfdc3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d68dca5c, /*tc_3stall*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_d7718fbe, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_db596beb, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_db96aa6b, 
/*tc_st*/ + [InstrStage<1, [SLOT0]>], [1], + [Hex_FWD]>, + + InstrItinData <tc_dc51281d, /*tc_3*/ + [InstrStage<1, [SLOT2]>], [2, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_decdde8a, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_df5d53f9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e3d699e3, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_e9170fb7, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ed03645c, /*tc_1*/ + [InstrStage<1, [SLOT2]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eed07714, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_eeda4109, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_ef921005, /*tc_1*/ + [InstrStage<1, [SLOT2, SLOT3]>], [3, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f098b237, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0cdeccf, /*tc_3x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f0e8e832, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f34c1c21, /*tc_2*/ + [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f38f92e1, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [2], + [Hex_FWD]>, + + InstrItinData <tc_f529831b, /*tc_latepredstaia*/ + [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f6e2aff9, /*tc_newvjump*/ + [InstrStage<1, [SLOT0]>], [3, 2, 2], + [Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f7569068, /*tc_4x*/ + [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1], + 
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_f999c66e, /*tc_1*/ + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fae9dfa5, /*tc_3x*/ + [InstrStage<1, [SLOT3]>], [4, 2], + [Hex_FWD, Hex_FWD]>, + + InstrItinData <tc_fedb7e19, /*tc_ld*/ + [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2], + [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]> + ]; +} diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td index b3f1b6638193..65d36924ba48 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td @@ -2288,6 +2288,12 @@ class Enc_a30110 : OpcodeHexagon { bits <5> Vd32; let Inst{4-0} = Vd32{4-0}; } +class Enc_a33d04 : OpcodeHexagon { + bits <5> Vuu32; + let Inst{12-8} = Vuu32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; +} class Enc_a42857 : OpcodeHexagon { bits <11> Ii; let Inst{21-20} = Ii{10-9}; @@ -3109,6 +3115,14 @@ class Enc_de0214 : OpcodeHexagon { bits <5> Rd32; let Inst{4-0} = Rd32{4-0}; } +class Enc_de5ea0 : OpcodeHexagon { + bits <5> Vuu32; + let Inst{12-8} = Vuu32{4-0}; + bits <5> Vv32; + let Inst{20-16} = Vv32{4-0}; + bits <5> Vd32; + let Inst{4-0} = Vd32{4-0}; +} class Enc_e07374 : OpcodeHexagon { bits <5> Rs32; let Inst{20-16} = Rs32{4-0}; diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td index 4f00409c336c..c02988266584 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -5824,8 +5824,8 @@ let Inst{31-21} = 0b01010100100; let hasNewValue = 1; let opNewValue = 0; let isSolo = 1; -let Uses = [GOSP]; -let Defs = [GOSP, PC]; +let Uses = [CCR, GOSP]; +let Defs = [CCR, GOSP, PC]; let hasSideEffects = 1; let Constraints = "$Rx32 = $Rx32in"; } @@ -8500,6 +8500,8 @@ let Inst{31-21} = 0b01010010101; let isTerminator = 1; let isIndirectBranch = 1; let isBranch = 1; +let 
cofRelax1 = 1; +let cofRelax2 = 1; let cofMax1 = 1; } def J4_jumpseti : HInst< @@ -18210,16 +18212,6 @@ let opExtentBits = 18; let opExtentAlign = 2; let opNewValue = 1; } -def PS_trap1 : HInst< -(outs), -(ins u8_0Imm:$Ii), -"trap1(#$Ii)", -tc_53c851ab, TypeJ>, Enc_a51a9a, Requires<[HasPreV65]> { -let Inst{1-0} = 0b00; -let Inst{7-5} = 0b000; -let Inst{13-13} = 0b0; -let Inst{31-16} = 0b0101010010000000; -} def R6_release_at_vi : HInst< (outs), (ins IntRegs:$Rs32), @@ -18964,7 +18956,7 @@ def S2_cabacdecbin : HInst< (outs DoubleRegs:$Rdd32), (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32), "$Rdd32 = decbin($Rss32,$Rtt32)", -tc_db596beb, TypeS_3op>, Enc_a56825 { +tc_db596beb, TypeS_3op>, Enc_a56825, Requires<[UseCabac]> { let Inst{7-5} = 0b110; let Inst{13-13} = 0b0; let Inst{31-21} = 0b11000001110; @@ -26883,17 +26875,6 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } -def V6_ldntnt0 : HInst< -(outs HvxVR:$Vd32), -(ins IntRegs:$Rt32), -"$Vd32 = vmem($Rt32):nt", -PSEUDO, TypeMAPPING>, Requires<[HasV62]> { -let hasNewValue = 1; -let opNewValue = 0; -let isPseudo = 1; -let isCodeGenOnly = 1; -let DecoderNamespace = "EXT_mmvec"; -} def V6_ldp0 : HInst< (outs HvxVR:$Vd32), (ins PredRegs:$Pv4, IntRegs:$Rt32), @@ -27312,6 +27293,30 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_v10mpyubs10 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxWR:$Vuu32, HvxWR:$Vvv32, u1_0Imm:$Ii), +"$Vdd32.w = v10mpy($Vuu32.ub,$Vvv32.b,#$Ii)", +tc_f175e046, TypeCVI_VX>, Requires<[UseHVXV69]> { +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let isPseudo = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_v10mpyubs10_vxx : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxWR:$Vuu32, HvxWR:$Vvv32, u1_0Imm:$Ii), +"$Vxx32.w += v10mpy($Vuu32.ub,$Vvv32.b,#$Ii)", +tc_4942646a, TypeCVI_VX>, Requires<[UseHVXV69]> { +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let isPseudo = 1; +let 
DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} def V6_v6mpyhubs10 : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32, HvxWR:$Vvv32, u2_0Imm:$Ii), @@ -27396,7 +27401,7 @@ def V6_vL32Ub_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmemu($Rt32+#$Ii)", -tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]> { +tc_a7e6707d, TypeCVI_VM_VP_LDU>, Enc_f3f408, Requires<[UseHVXV60]>, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27408,13 +27413,15 @@ let isCVLoad = 1; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let BaseOpcode = "V6_vL32Ub_ai"; +let CextOpcode = "V6_vL32Ub"; let DecoderNamespace = "EXT_mmvec"; } def V6_vL32Ub_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmemu($Rx32++#$Ii)", -tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]> { +tc_3c56e5ce, TypeCVI_VM_VP_LDU>, Enc_a255dc, Requires<[UseHVXV60]>, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27427,6 +27434,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_pi"; +let CextOpcode = "V6_vL32Ub"; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } @@ -27452,7 +27460,7 @@ def V6_vL32b_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii)", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27465,6 +27473,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_ai"; +let CextOpcode = "V6_vL32b"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -27473,7 +27482,7 @@ def V6_vL32b_cur_ai : 
HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii)", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000000; @@ -27487,6 +27496,7 @@ let CVINew = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_cur_ai"; +let CextOpcode = "V6_vL32b_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -27560,7 +27570,7 @@ def V6_vL32b_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii)", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -27574,6 +27584,7 @@ let CVINew = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_cur_pi"; +let CextOpcode = "V6_vL32b_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -27729,7 +27740,7 @@ def V6_vL32b_nt_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32 = vmem($Rt32+#$Ii):nt", -tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27743,6 +27754,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_ai"; +let CextOpcode = "V6_vL32b_nt"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -27751,7 +27763,7 @@ def V6_vL32b_nt_cur_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.cur = vmem($Rt32+#$Ii):nt", 
-tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_c0749f3c, TypeCVI_VM_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -27766,6 +27778,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_cur_ai"; +let CextOpcode = "V6_vL32b_nt_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -27842,7 +27855,7 @@ def V6_vL32b_nt_cur_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.cur = vmem($Rx32++#$Ii):nt", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b001; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -27857,6 +27870,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_cur_pi"; +let CextOpcode = "V6_vL32b_nt_cur"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28019,7 +28033,7 @@ def V6_vL32b_nt_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii):nt", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -28033,6 +28047,7 @@ let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_pi"; +let CextOpcode = "V6_vL32b_nt"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28127,7 +28142,7 @@ def V6_vL32b_nt_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii):nt", -tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, 
Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000010; @@ -28137,11 +28152,12 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_tmp_ai"; +let CextOpcode = "V6_vL32b_nt_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28160,7 +28176,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28183,7 +28199,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28206,7 +28222,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28218,7 +28234,7 @@ def V6_vL32b_nt_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii):nt", -tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001010; @@ -28228,11 +28244,12 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_nt_tmp_pi"; +let CextOpcode = "V6_vL32b_nt_tmp"; let 
isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28250,7 +28267,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28273,7 +28290,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28295,7 +28312,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28317,7 +28334,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isNonTemporal = 1; let isRestrictNoSlot1Store = 1; @@ -28329,7 +28346,7 @@ def V6_vL32b_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32 = vmem($Rx32++#$Ii)", -tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_1ba8a0cd, TypeCVI_VM_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -28342,6 +28359,7 @@ let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_pi"; +let CextOpcode = "V6_vL32b"; let isCVLoadable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28432,7 +28450,7 @@ def V6_vL32b_tmp_ai : HInst< (outs HvxVR:$Vd32), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "$Vd32.tmp = vmem($Rt32+#$Ii)", -tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel { +tc_52447ecc, TypeCVI_VM_TMP_LD>, Enc_f3f408, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{12-11} = 0b00; let 
Inst{31-21} = 0b00101000000; @@ -28442,10 +28460,11 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; +let CextOpcode = "V6_vL32b_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28464,7 +28483,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; @@ -28486,7 +28505,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; @@ -28508,7 +28527,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28519,7 +28538,7 @@ def V6_vL32b_tmp_pi : HInst< (outs HvxVR:$Vd32, IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "$Vd32.tmp = vmem($Rx32++#$Ii)", -tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel { +tc_663c80a7, TypeCVI_VM_TMP_LD>, Enc_a255dc, Requires<[UseHVXV60]>, PredRel, PostInc_BaseImm { let Inst{7-5} = 0b010; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001000; @@ -28529,10 +28548,11 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; +let CextOpcode = "V6_vL32b_tmp"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28550,7 +28570,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; 
-let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28572,7 +28592,7 @@ let addrMode = BaseImmOffset; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ai"; @@ -28593,7 +28613,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_pi"; @@ -28614,7 +28634,7 @@ let addrMode = PostInc; let accessSize = HVXVectorAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; +let hasHvxTmp = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; let BaseOpcode = "V6_vL32b_tmp_ppu"; @@ -28625,7 +28645,7 @@ def V6_vS32Ub_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rt32+#$Ii) = $Vs32", -tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_f21e8abb, TypeCVI_VM_STU>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28634,6 +28654,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32Ub_ai"; +let CextOpcode = "V6_vS32Ub"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; } @@ -28692,7 +28713,7 @@ def V6_vS32Ub_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmemu($Rx32++#$Ii) = $Vs32", -tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_e2d2e9e5, TypeCVI_VM_STU>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b111; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28701,6 +28722,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32Ub_pi"; +let 
CextOpcode = "V6_vS32Ub"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; @@ -28773,7 +28795,7 @@ def V6_vS32b_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii) = $Vs32", -tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28782,6 +28804,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -28790,7 +28813,7 @@ def V6_vS32b_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii) = $Os8.new", -tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000001; @@ -28802,6 +28825,7 @@ let CVINew = 1; let isNewValue = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 2; @@ -28873,7 +28897,7 @@ def V6_vS32b_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii) = $Os8.new", -tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -28885,6 +28909,7 @@ let CVINew = 1; let isNewValue = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let 
opNewValue = 3; @@ -29070,7 +29095,7 @@ def V6_vS32b_nt_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rt32+#$Ii):nt = $Vs32", -tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel { +tc_c5dba46e, TypeCVI_VM_ST>, Enc_c9e3bc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -29080,6 +29105,7 @@ let isCVI = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_nt"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29088,7 +29114,7 @@ def V6_vS32b_nt_new_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii, HvxVR:$Os8), "vmem($Rt32+#$Ii):nt = $Os8.new", -tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel { +tc_ab23f776, TypeCVI_VM_NEW_ST>, Enc_f77fbc, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101000011; @@ -29101,6 +29127,7 @@ let isNewValue = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_ai"; +let CextOpcode = "V6_vS32b_nt_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 2; @@ -29175,7 +29202,7 @@ def V6_vS32b_nt_new_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Os8), "vmem($Rx32++#$Ii):nt = $Os8.new", -tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel { +tc_6942b6e0, TypeCVI_VM_NEW_ST>, Enc_1aaec1, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-3} = 0b00100; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -29188,6 +29215,7 @@ let isNewValue = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_nt_new"; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; let opNewValue = 3; @@ -29383,7 +29411,7 @@ def V6_vS32b_nt_pi : HInst< (outs 
IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii):nt = $Vs32", -tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001011; @@ -29393,6 +29421,7 @@ let isCVI = 1; let isNonTemporal = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b_nt"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29519,7 +29548,7 @@ def V6_vS32b_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, HvxVR:$Vs32), "vmem($Rx32++#$Ii) = $Vs32", -tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel { +tc_3e2aaafc, TypeCVI_VM_ST>, Enc_b62ef7, Requires<[UseHVXV60]>, NewValueRel, PostInc_BaseImm { let Inst{7-5} = 0b000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101001001; @@ -29528,6 +29557,7 @@ let accessSize = HVXVectorAccess; let isCVI = 1; let mayStore = 1; let BaseOpcode = "V6_vS32b_pi"; +let CextOpcode = "V6_vS32b"; let isNVStorable = 1; let isPredicable = 1; let DecoderNamespace = "EXT_mmvec"; @@ -29689,6 +29719,32 @@ let mayStore = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } +def V6_vabs_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vabs($Vu32.hf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vabs_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = vabs($Vu32.sf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue 
= 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vabsb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32), @@ -29975,6 +30031,123 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vadd_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.hf,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_hf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vadd($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.qf16,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf16_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vadd($Vu32.qf16,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.qf32,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; 
+let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_qf32_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.qf32,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vadd($Vu32.sf,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vadd($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vadd_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vadd($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vaddb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -31440,6 +31613,58 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vasrvuhubrndsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.ub = vasr($Vuu32.uh,$Vv32.ub):rnd:sat", +tc_05ca8cfd, TypeCVI_VS>, 
Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvuhubsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.ub = vasr($Vuu32.uh,$Vv32.ub):sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvwuhrndsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.uh = vasr($Vuu32.w,$Vv32.uh):rnd:sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vasrvwuhsat : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32, HvxVR:$Vv32), +"$Vd32.uh = vasr($Vuu32.w,$Vv32.uh):sat", +tc_05ca8cfd, TypeCVI_VS>, Enc_de5ea0, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011101000; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vasrw : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -31597,6 +31822,33 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vassign_fp : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.w = vfmv($Vu32.w)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vassign_tmp : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.tmp = $Vu32", +tc_2120355e, TypeCVI_VX>, 
Enc_e7581c, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b0; +let Inst{31-16} = 0b0001111000000001; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let hasHvxTmp = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vassignp : HInst< (outs HvxWR:$Vdd32), (ins HvxWR:$Vuu32), @@ -32000,6 +32252,189 @@ let isCVI = 1; let isRegSequence = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vcombine_tmp : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.tmp = vcombine($Vu32,$Vv32)", +tc_aa047364, TypeCVI_VX>, Enc_71bb9b, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b0; +let Inst{31-21} = 0b00011110101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let hasHvxTmp = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_hf_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = $Vu32.qf16", +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_hf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxWR:$Vuu32), +"$Vd32.hf = $Vuu32.qf32", +tc_51d0ecc3, TypeCVI_VS>, Enc_a33d04, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vconv_sf_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = $Vu32.qf32", +tc_51d0ecc3, TypeCVI_VS>, Enc_e7581c, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_b_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.b = vcvt($Vu32.hf,$Vv32.hf)", 
+tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_h_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.h = vcvt($Vu32.hf)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_b : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.hf = vcvt($Vu32.b)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_h : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vcvt($Vu32.h)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vcvt($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_ub : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.hf = vcvt($Vu32.ub)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let 
hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_hf_uh : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vcvt($Vu32.uh)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32), +"$Vdd32.sf = vcvt($Vu32.hf)", +tc_0afc8be9, TypeCVI_VX_DV>, Enc_dd766a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_ub_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.ub = vcvt($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vcvt_uh_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.uh = vcvt($Vu32.hf)", +tc_3c8c15d0, TypeCVI_VX>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vd0 : HInst< (outs HvxVR:$Vd32), (ins), @@ -32141,6 +32576,34 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vdmpy_sf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vdmpy($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let 
hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vdmpy_sf_hf_acc : HInst< +(outs HvxVR:$Vx32), +(ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vx32.sf += vdmpy($Vu32.hf,$Vv32.hf)", +tc_a19b9305, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vx32 = $Vx32in"; +} def V6_vdmpybus : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -32415,7 +32878,7 @@ def V6_vdmpyhsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32428,7 +32891,7 @@ def V6_vdmpyhsat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat", -tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_72e2b393, TypeCVI_VX>, Enc_5138b3, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32523,7 +32986,7 @@ def V6_vdmpyhsusat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001001; @@ -32536,7 +32999,7 @@ def V6_vdmpyhsusat_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, IntRegs:$Rt32), "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat", -tc_660769f1, TypeCVI_VX_DV>, Enc_5138b3, Requires<[UseHVXV60]> { +tc_72e2b393, TypeCVI_VX>, Enc_5138b3, 
Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011001001; @@ -32577,7 +33040,7 @@ def V6_vdmpyhvsat : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat", -tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_73efe966, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b011; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100000; @@ -32831,6 +33294,84 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vfmax_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vfmax($Vu32.hf,$Vv32.hf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmax_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vfmax($Vu32.sf,$Vv32.sf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmin_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vfmin($Vu32.hf,$Vv32.hf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfmin_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vfmin($Vu32.sf,$Vv32.sf)", +tc_cda936da, TypeCVI_VX_LATE>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100011; 
+let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfneg_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.hf = vfneg($Vu32.hf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vfneg_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32), +"$Vd32.sf = vfneg($Vu32.sf)", +tc_5cdf8c84, TypeCVI_VX_LATE>, Enc_e7581c, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-16} = 0b0001111000000110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vgathermh : HInst< (outs), (ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32), @@ -32843,7 +33384,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32860,7 +33400,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32877,7 +33416,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32894,7 +33432,6 @@ let opNewValue = 0; let accessSize = HalfWordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32911,7 +33448,6 @@ let opNewValue = 0; let accessSize = WordAccess; let isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -32928,7 +33464,6 @@ let opNewValue = 0; let accessSize = WordAccess; let 
isCVLoad = 1; let isCVI = 1; -let hasTmpDst = 1; let mayLoad = 1; let Defs = [VTMP]; let DecoderNamespace = "EXT_mmvec"; @@ -33033,6 +33568,106 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Qx4 = $Qx4in"; } +def V6_vgthf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b011101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vgthf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b110011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgthf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b001101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgthf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.gt($Vu32.hf,$Vv32.hf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b111011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf : HInst< +(outs HvxQR:$Qd4), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Qd4 = vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_95441f, 
Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b011100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vgtsf_and : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 &= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b110010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf_or : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 |= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b001100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} +def V6_vgtsf_xor : HInst< +(outs HvxQR:$Qx4), +(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Qx4 ^= vcmp.gt($Vu32.sf,$Vv32.sf)", +tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV68,UseHVXFloatingPoint]> { +let Inst{7-2} = 0b111010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100100; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Qx4 = $Qx4in"; +} def V6_vgtub : HInst< (outs HvxQR:$Qd4), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -33552,6 +34187,32 @@ let opNewValue = 0; let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmax_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmax($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmax_sf : 
HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmax($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmaxb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -33677,6 +34338,32 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmin_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmin($Vu32.hf,$Vv32.hf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmin_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmin($Vu32.sf,$Vv32.sf)", +tc_56c4f9fe, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vminb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -34110,6 +34797,179 @@ let isCVI = 1; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Vx32 = $Vx32in"; } +def V6_vmpy_hf_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vmpy($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_hf_hf_acc : HInst< +(outs HvxVR:$Vx32), +(ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vx32.hf += vmpy($Vu32.hf,$Vv32.hf)", 
+tc_a19b9305, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vx32 = $Vx32in"; +} +def V6_vmpy_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.qf16,$Vv32.qf16)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf16_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf16_mix_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vmpy($Vu32.qf16,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vmpy($Vu32.qf32,$Vv32.qf32)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.hf,$Vv32.hf)", 
+tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_mix_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.qf16,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_qf16 : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.qf32 = vmpy($Vu32.qf16,$Vv32.qf16)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_qf32_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vmpy($Vu32.sf,$Vv32.sf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111111; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vmpy($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b010; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vmpy_sf_hf_acc : HInst< +(outs HvxWR:$Vxx32), +(ins HvxWR:$Vxx32in, HvxVR:$Vu32, HvxVR:$Vv32), +"$Vxx32.sf += vmpy($Vu32.hf,$Vv32.hf)", +tc_08a4f1b6, TypeCVI_VX_DV>, Enc_3fc427, 
Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011100010; +let hasNewValue = 1; +let opNewValue = 0; +let isAccumulator = 1; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +let Constraints = "$Vxx32 = $Vxx32in"; +} +def V6_vmpy_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vmpy($Vu32.sf,$Vv32.sf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b001; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmpybus : HInst< (outs HvxWR:$Vdd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), @@ -34397,7 +35257,7 @@ def V6_vmpyhsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34422,7 +35282,7 @@ def V6_vmpyhss : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, IntRegs:$Rt32), "$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat", -tc_0b04c6c7, TypeCVI_VX_DV>, Enc_b087ac, Requires<[UseHVXV60]> { +tc_dcca380f, TypeCVI_VX>, Enc_b087ac, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011001010; @@ -34555,7 +35415,7 @@ def V6_vmpyhvsrs : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), "$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat", -tc_d8287c14, TypeCVI_VX_DV>, Enc_45364e, Requires<[UseHVXV60]> { +tc_73efe966, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b0; let Inst{31-21} = 0b00011100001; @@ -35332,6 +36192,19 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vmpyuhvs : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), 
+"$Vd32.uh = vmpy($Vu32.uh,$Vv32.uh):>>16", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV69]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111110; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vmux : HInst< (outs HvxVR:$Vd32), (ins HvxQR:$Qt4, HvxVR:$Vu32, HvxVR:$Vv32), @@ -36007,7 +36880,7 @@ def V6_vrmpybusv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b010; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -36061,7 +36934,7 @@ def V6_vrmpybv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.w += vrmpy($Vu32.b,$Vv32.b)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b001; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -36277,7 +37150,7 @@ def V6_vrmpyubv_acc : HInst< (outs HvxVR:$Vx32), (ins HvxVR:$Vx32in, HvxVR:$Vu32, HvxVR:$Vv32), "$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)", -tc_08a4f1b6, TypeCVI_VX_DV>, Enc_a7341a, Requires<[UseHVXV60]> { +tc_37820f4c, TypeCVI_VX>, Enc_a7341a, Requires<[UseHVXV60]> { let Inst{7-5} = 0b000; let Inst{13-13} = 0b1; let Inst{31-21} = 0b00011100000; @@ -37412,6 +38285,123 @@ let isPseudo = 1; let isCodeGenOnly = 1; let DecoderNamespace = "EXT_mmvec"; } +def V6_vsub_hf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.hf,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b110; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_hf_hf : HInst< +(outs 
HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.hf = vsub($Vu32.hf,$Vv32.hf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b000; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf16 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.qf16,$Vv32.qf16)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf16_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf16 = vsub($Vu32.qf16,$Vv32.hf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111011; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf32 : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.qf32,$Vv32.qf32)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b011; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_qf32_mix : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.qf32,$Vv32.sf)", +tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.qf32 = vsub($Vu32.sf,$Vv32.sf)", 
+tc_05ca8cfd, TypeCVI_VS>, Enc_45364e, Requires<[UseHVXV68,UseHVXQFloat]> { +let Inst{7-5} = 0b100; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111101; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf_hf : HInst< +(outs HvxWR:$Vdd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vdd32.sf = vsub($Vu32.hf,$Vv32.hf)", +tc_d8287c14, TypeCVI_VX_DV>, Enc_71bb9b, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b101; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} +def V6_vsub_sf_sf : HInst< +(outs HvxVR:$Vd32), +(ins HvxVR:$Vu32, HvxVR:$Vv32), +"$Vd32.sf = vsub($Vu32.sf,$Vv32.sf)", +tc_c127de3a, TypeCVI_VX>, Enc_45364e, Requires<[UseHVXV68,UseHVXIEEEFP]> { +let Inst{7-5} = 0b111; +let Inst{13-13} = 0b1; +let Inst{31-21} = 0b00011111100; +let hasNewValue = 1; +let opNewValue = 0; +let isCVI = 1; +let DecoderNamespace = "EXT_mmvec"; +} def V6_vsubb : HInst< (outs HvxVR:$Vd32), (ins HvxVR:$Vu32, HvxVR:$Vv32), @@ -38647,7 +39637,7 @@ def V6_zLd_ai : HInst< (outs), (ins IntRegs:$Rt32, s4_0Imm:$Ii), "z = vmem($Rt32+#$Ii)", -tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]> { +tc_e699ae41, TypeCVI_ZW>, Enc_ff3442, Requires<[UseHVXV66,UseZReg]>, PostInc_BaseImm { let Inst{7-0} = 0b00000000; let Inst{12-11} = 0b00; let Inst{31-21} = 0b00101100000; @@ -38655,13 +39645,14 @@ let addrMode = BaseImmOffset; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let CextOpcode = "V6_zLd"; let DecoderNamespace = "EXT_mmvec"; } def V6_zLd_pi : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii), "z = vmem($Rx32++#$Ii)", -tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]> { +tc_a0dbea28, TypeCVI_ZW>, Enc_6c9ee0, Requires<[UseHVXV66,UseZReg]>, PostInc_BaseImm { let Inst{7-0} = 0b00000000; let Inst{13-11} = 0b000; let Inst{31-21} = 0b00101101000; @@ -38669,6 
+39660,7 @@ let addrMode = PostInc; let isCVI = 1; let mayLoad = 1; let isRestrictNoSlot1Store = 1; +let CextOpcode = "V6_zLd"; let DecoderNamespace = "EXT_mmvec"; let Constraints = "$Rx32 = $Rx32in"; } @@ -38782,6 +39774,17 @@ let Inst{13-0} = 0b00000000000000; let Inst{31-16} = 0b0110110000100000; let isSolo = 1; } +def Y2_crswap_old : HInst< +(outs IntRegs:$Rx32), +(ins IntRegs:$Rx32in), +"crswap($Rx32,sgp)", +PSEUDO, TypeMAPPING> { +let hasNewValue = 1; +let opNewValue = 0; +let isPseudo = 1; +let isCodeGenOnly = 1; +let Constraints = "$Rx32 = $Rx32in"; +} def Y2_dccleana : HInst< (outs), (ins IntRegs:$Rs32), @@ -38861,6 +39864,22 @@ let Inst{13-0} = 0b00000000000010; let Inst{31-16} = 0b0101011111000000; let isSolo = 1; } +def Y2_k1lock_map : HInst< +(outs), +(ins), +"k1lock", +PSEUDO, TypeMAPPING>, Requires<[HasV65]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +} +def Y2_k1unlock_map : HInst< +(outs), +(ins), +"k1unlock", +PSEUDO, TypeMAPPING>, Requires<[HasV65]> { +let isPseudo = 1; +let isCodeGenOnly = 1; +} def Y2_syncht : HInst< (outs), (ins), @@ -39083,7 +40102,7 @@ def dup_A2_add : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, IntRegs:$Rt32), "$Rd32 = add($Rs32,$Rt32)", -tc_388f9897, TypeALU32_3op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_3op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39093,7 +40112,7 @@ def dup_A2_addi : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = add($Rs32,#$Ii)", -tc_388f9897, TypeALU32_ADDI>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_ADDI>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39108,7 +40127,7 @@ def dup_A2_andir : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = and($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = 
"NonParsable"; @@ -39123,7 +40142,7 @@ def dup_A2_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, s8_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39136,7 +40155,7 @@ def dup_A2_sxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxtb($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39146,7 +40165,7 @@ def dup_A2_sxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = sxth($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39156,7 +40175,7 @@ def dup_A2_tfr : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = $Rs32", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39166,7 +40185,7 @@ def dup_A2_tfrsi : HInst< (outs IntRegs:$Rd32), (ins s32_0Imm:$Ii), "$Rd32 = #$Ii", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39181,7 +40200,7 @@ def dup_A2_zxtb : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxtb($Rs32)", -PSEUDO, TypeMAPPING>, Requires<[HasV68]> { +PSEUDO, TypeMAPPING>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39191,7 +40210,7 @@ def dup_A2_zxth : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32), "$Rd32 = zxth($Rs32)", -tc_9124c04f, TypeALU32_2op>, Requires<[HasV68]> { +tc_9124c04f, TypeALU32_2op>, Requires<[HasV69]> { let 
hasNewValue = 1; let opNewValue = 0; let AsmVariantName = "NonParsable"; @@ -39201,7 +40220,7 @@ def dup_A4_combineii : HInst< (outs DoubleRegs:$Rdd32), (ins s8_0Imm:$Ii, u32_0Imm:$II), "$Rdd32 = combine(#$Ii,#$II)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39214,7 +40233,7 @@ def dup_A4_combineir : HInst< (outs DoubleRegs:$Rdd32), (ins s32_0Imm:$Ii, IntRegs:$Rs32), "$Rdd32 = combine(#$Ii,$Rs32)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39227,7 +40246,7 @@ def dup_A4_combineri : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rdd32 = combine($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39240,7 +40259,7 @@ def dup_C2_cmoveif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4) $Rd32 = #$Ii", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -39257,7 +40276,7 @@ def dup_C2_cmoveit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4) $Rd32 = #$Ii", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -39273,7 +40292,7 @@ def dup_C2_cmovenewif : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if (!$Pu4.new) $Rd32 = #$Ii", -tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> { +tc_4ac61d92, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let isPredicatedFalse = 1; let hasNewValue = 1; @@ -39291,7 +40310,7 @@ def 
dup_C2_cmovenewit : HInst< (outs IntRegs:$Rd32), (ins PredRegs:$Pu4, s32_0Imm:$Ii), "if ($Pu4.new) $Rd32 = #$Ii", -tc_4ac61d92, TypeALU32_2op>, Requires<[HasV68]> { +tc_4ac61d92, TypeALU32_2op>, Requires<[HasV69]> { let isPredicated = 1; let hasNewValue = 1; let opNewValue = 0; @@ -39308,7 +40327,7 @@ def dup_C2_cmpeqi : HInst< (outs PredRegs:$Pd4), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Pd4 = cmp.eq($Rs32,#$Ii)", -tc_388f9897, TypeALU32_2op>, Requires<[HasV68]> { +tc_388f9897, TypeALU32_2op>, Requires<[HasV69]> { let AsmVariantName = "NonParsable"; let isPseudo = 1; let isExtendable = 1; @@ -39321,7 +40340,7 @@ def dup_L2_deallocframe : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32), "$Rdd32 = deallocframe($Rs32):raw", -tc_aee6250c, TypeLD>, Requires<[HasV68]> { +tc_aee6250c, TypeLD>, Requires<[HasV69]> { let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; let mayLoad = 1; @@ -39333,7 +40352,7 @@ def dup_L2_loadrb_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memb($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39351,7 +40370,7 @@ def dup_L2_loadrd_io : HInst< (outs DoubleRegs:$Rdd32), (ins IntRegs:$Rs32, s29_3Imm:$Ii), "$Rdd32 = memd($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; @@ -39367,7 +40386,7 @@ def dup_L2_loadrh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memh($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39385,7 +40404,7 @@ def dup_L2_loadri_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s30_2Imm:$Ii), "$Rd32 = memw($Rs32+#$Ii)", -tc_eed07714, 
TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39403,7 +40422,7 @@ def dup_L2_loadrub_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s32_0Imm:$Ii), "$Rd32 = memub($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39421,7 +40440,7 @@ def dup_L2_loadruh_io : HInst< (outs IntRegs:$Rd32), (ins IntRegs:$Rs32, s31_1Imm:$Ii), "$Rd32 = memuh($Rs32+#$Ii)", -tc_eed07714, TypeLD>, Requires<[HasV68]> { +tc_eed07714, TypeLD>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39439,7 +40458,7 @@ def dup_S2_allocframe : HInst< (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, u11_3Imm:$Ii), "allocframe($Rx32,#$Ii):raw", -tc_74a42bda, TypeST>, Requires<[HasV68]> { +tc_74a42bda, TypeST>, Requires<[HasV69]> { let hasNewValue = 1; let opNewValue = 0; let addrMode = BaseImmOffset; @@ -39455,7 +40474,7 @@ def dup_S2_storerb_io : HInst< (outs), (ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32), "memb($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = ByteAccess; let AsmVariantName = "NonParsable"; @@ -39471,7 +40490,7 @@ def dup_S2_storerd_io : HInst< (outs), (ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32), "memd($Rs32+#$Ii) = $Rtt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = DoubleWordAccess; let AsmVariantName = "NonParsable"; @@ -39487,7 +40506,7 @@ def dup_S2_storerh_io : HInst< (outs), (ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32), "memh($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = HalfWordAccess; 
let AsmVariantName = "NonParsable"; @@ -39503,7 +40522,7 @@ def dup_S2_storeri_io : HInst< (outs), (ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32), "memw($Rs32+#$Ii) = $Rt32", -tc_a9edeffa, TypeST>, Requires<[HasV68]> { +tc_a9edeffa, TypeST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = WordAccess; let AsmVariantName = "NonParsable"; @@ -39519,7 +40538,7 @@ def dup_S4_storeirb_io : HInst< (outs), (ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II), "memb($Rs32+#$Ii) = #$II", -tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> { +tc_838c4d7a, TypeV4LDST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = ByteAccess; let AsmVariantName = "NonParsable"; @@ -39535,7 +40554,7 @@ def dup_S4_storeiri_io : HInst< (outs), (ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II), "memw($Rs32+#$Ii) = #$II", -tc_838c4d7a, TypeV4LDST>, Requires<[HasV68]> { +tc_838c4d7a, TypeV4LDST>, Requires<[HasV69]> { let addrMode = BaseImmOffset; let accessSize = WordAccess; let AsmVariantName = "NonParsable"; diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td index e5c78d122c9e..64bc5091d1d1 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td @@ -1661,8 +1661,6 @@ def: Pat<(int_hexagon_Y2_dccleana IntRegs:$src1), (Y2_dccleana IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dccleaninva IntRegs:$src1), (Y2_dccleaninva IntRegs:$src1)>, Requires<[HasV5]>; -def: Pat<(int_hexagon_Y2_dcfetch IntRegs:$src1), - (Y2_dcfetch IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dcinva IntRegs:$src1), (Y2_dcinva IntRegs:$src1)>, Requires<[HasV5]>; def: Pat<(int_hexagon_Y2_dczeroa IntRegs:$src1), @@ -3380,3 +3378,294 @@ def: Pat<(int_hexagon_V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX64B]>; def: 
Pat<(int_hexagon_V6_v6mpyvubs10_vxx_128B HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4), (V6_v6mpyvubs10_vxx HvxWR:$src1, HvxWR:$src2, HvxWR:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabs_hf HvxVR:$src1), + (V6_vabs_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabs_hf_128B HvxVR:$src1), + (V6_vabs_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vabs_sf HvxVR:$src1), + (V6_vabs_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vabs_sf_128B HvxVR:$src1), + (V6_vabs_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf16_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2)>, 
Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_qf32_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vadd_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vadd_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vassign_fp HvxVR:$src1), + (V6_vassign_fp HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vassign_fp_128B HvxVR:$src1), + (V6_vassign_fp HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf16 HvxVR:$src1), + (V6_vconv_hf_qf16 HvxVR:$src1)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf16_128B HvxVR:$src1), + (V6_vconv_hf_qf16 HvxVR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf32 HvxWR:$src1), + (V6_vconv_hf_qf32 HvxWR:$src1)>, 
Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_hf_qf32_128B HvxWR:$src1), + (V6_vconv_hf_qf32 HvxWR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_sf_qf32 HvxVR:$src1), + (V6_vconv_sf_qf32 HvxVR:$src1)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vconv_sf_qf32_128B HvxVR:$src1), + (V6_vconv_sf_qf32 HvxVR:$src1)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_b_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_b_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_h_hf HvxVR:$src1), + (V6_vcvt_h_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_h_hf_128B HvxVR:$src1), + (V6_vcvt_h_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_b HvxVR:$src1), + (V6_vcvt_hf_b HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_b_128B HvxVR:$src1), + (V6_vcvt_hf_b HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_h HvxVR:$src1), + (V6_vcvt_hf_h HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_h_128B HvxVR:$src1), + (V6_vcvt_hf_h HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_hf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_ub HvxVR:$src1), + (V6_vcvt_hf_ub HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_ub_128B HvxVR:$src1), + (V6_vcvt_hf_ub HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_uh 
HvxVR:$src1), + (V6_vcvt_hf_uh HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_hf_uh_128B HvxVR:$src1), + (V6_vcvt_hf_uh HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_sf_hf HvxVR:$src1), + (V6_vcvt_sf_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_sf_hf_128B HvxVR:$src1), + (V6_vcvt_sf_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_ub_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vcvt_ub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vcvt_uh_hf HvxVR:$src1), + (V6_vcvt_uh_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vcvt_uh_hf_128B HvxVR:$src1), + (V6_vcvt_uh_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vdmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vdmpy_sf_hf_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vdmpy_sf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmax_hf HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmax_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmax_sf HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, 
UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmax_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmin_hf HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmin_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfmin_sf HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfmin_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vfmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfneg_hf HvxVR:$src1), + (V6_vfneg_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfneg_hf_128B HvxVR:$src1), + (V6_vfneg_hf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vfneg_sf HvxVR:$src1), + (V6_vfneg_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vfneg_sf_128B HvxVR:$src1), + (V6_vfneg_sf HvxVR:$src1)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vgthf HvxVR:$src1, HvxVR:$src2), + (V6_vgthf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgthf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; 
+def: Pat<(int_hexagon_V6_vgthf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgthf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgthf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf HvxVR:$src1, HvxVR:$src2), + (V6_vgtsf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vgtsf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vgtsf_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vgtsf_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmax_hf 
HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmax_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmax_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmax_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmin_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmin_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmin_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_hf_hf_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_hf_hf_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_128B HvxVR:$src1, HvxVR:$src2), + 
(V6_vmpy_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf16_mix_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf16_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_mix_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_mix_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_qf16 HvxVR:$src1, 
HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_qf32_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_qf32_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_hf_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3), + (V6_vmpy_sf_hf_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpy_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpy_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_hf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_hf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_qf16 HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16 HvxVR:$src1, 
HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf16_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf16_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_qf32_mix_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_qf32_mix HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B, UseHVXQFloat]>; +def: Pat<(int_hexagon_V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_sf_hf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_hf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vsub_sf_sf HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_sf HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vsub_sf_sf_128B HvxVR:$src1, HvxVR:$src2), + (V6_vsub_sf_sf 
HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV68, UseHVX128B]>; + +// V69 HVX Instructions. + +def: Pat<(int_hexagon_V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvuhubrndsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvuhubsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvuhubsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvwuhrndsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhrndsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vasrvwuhsat_128B HvxWR:$src1, HvxVR:$src2), + (V6_vasrvwuhsat HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; +def: Pat<(int_hexagon_V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX64B]>; +def: Pat<(int_hexagon_V6_vmpyuhvs_128B HvxVR:$src1, HvxVR:$src2), + (V6_vmpyuhvs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV69, UseHVX128B]>; diff --git a/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/llvm/lib/Target/Hexagon/HexagonDepMappings.td index 919cb996ad15..2f7b76b893a9 100644 --- a/llvm/lib/Target/Hexagon/HexagonDepMappings.td +++ b/llvm/lib/Target/Hexagon/HexagonDepMappings.td @@ -174,7 +174,6 @@ def V6_ldcpnt0Alias : InstAlias<"if ($Pv4) $Vd32.cur = vmem($Rt32):nt", (V6_vL32 def V6_ldnp0Alias : InstAlias<"if (!$Pv4) $Vd32 = vmem($Rt32)", (V6_vL32b_npred_pi 
HvxVR:$Vd32, IntRegs:$Rt32, PredRegs:$Pv4, 0)>, Requires<[UseHVX]>; def V6_ldnpnt0Alias : InstAlias<"if (!$Pv4) $Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_npred_pi HvxVR:$Vd32, IntRegs:$Rt32, PredRegs:$Pv4, 0)>, Requires<[UseHVX]>; def V6_ldnt0Alias : InstAlias<"$Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_ai HvxVR:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; -def V6_ldntnt0Alias : InstAlias<"$Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_ai HvxVR:$Vd32, IntRegs:$Rt32, 0)>; def V6_ldp0Alias : InstAlias<"if ($Pv4) $Vd32 = vmem($Rt32)", (V6_vL32b_pred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; def V6_ldpnt0Alias : InstAlias<"if ($Pv4) $Vd32 = vmem($Rt32):nt", (V6_vL32b_nt_pred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; def V6_ldtnp0Alias : InstAlias<"if (!$Pv4) $Vd32.tmp = vmem($Rt32)", (V6_vL32b_npred_ai HvxVR:$Vd32, PredRegs:$Pv4, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 46c1fbc6eeb2..85230cac9d7c 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1445,8 +1445,8 @@ bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) { MachineBasicBlock *B = N->getBlock(); std::vector<MachineInstr*> Instrs; - for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) - Instrs.push_back(&*I); + for (MachineInstr &MI : llvm::reverse(*B)) + Instrs.push_back(&MI); for (MachineInstr *MI : Instrs) { unsigned Opc = MI->getOpcode(); diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp index e45126bec6ef..44679d429de5 100644 --- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp @@ -60,7 +60,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) { RetVal = NoHazard; LLVM_DEBUG(dbgs() << "*** Try .new version? 
" << (RetVal == NoHazard) << "\n"); - MF->DeleteMachineInstr(NewMI); + MF->deleteMachineInstr(NewMI); } return RetVal; } @@ -129,7 +129,7 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) { MI->getDebugLoc()); assert(Resources->canReserveResources(*NewMI)); Resources->reserveResources(*NewMI); - MF->DeleteMachineInstr(NewMI); + MF->deleteMachineInstr(NewMI); } else Resources->reserveResources(*MI); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index 45adaf50774f..898ef51bd48f 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -146,9 +146,6 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> isFP = 0; let TSFlags {50} = isFP; // Floating-point. - bits<1> isSomeOK = 0; - let TSFlags {51} = isSomeOK; // Relax some grouping constraints. - bits<1> hasNewValue2 = 0; let TSFlags{52} = hasNewValue2; // Second New-value producer insn. 
bits<3> opNewValue2 = 0; @@ -160,8 +157,8 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> prefersSlot3 = 0; let TSFlags{57} = prefersSlot3; // Complex XU - bits<1> hasTmpDst = 0; - let TSFlags{60} = hasTmpDst; // v65 : 'fake" register VTMP is set + bits<1> hasHvxTmp = 0; + let TSFlags{60} = hasHvxTmp; // vector register vX.tmp false-write bit CVINew = 0; let TSFlags{62} = CVINew; diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index b6984d40f78e..931b0c0e0090 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCRegisterInfo.h" @@ -4655,3 +4656,11 @@ short HexagonInstrInfo::changeAddrMode_rr_ur(short Opc) const { short HexagonInstrInfo::changeAddrMode_ur_rr(short Opc) const { return Opc >= 0 ? 
Hexagon::changeAddrMode_ur_rr(Opc) : Opc; } + +MCInst HexagonInstrInfo::getNop() const { + static const MCInst Nop = MCInstBuilder(Hexagon::A2_nop); + + return MCInstBuilder(Hexagon::BUNDLE) + .addImm(0) + .addInst(&Nop); +} diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index eaaf9f7046c7..830f04d9eac3 100644 --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -524,6 +524,8 @@ public: short changeAddrMode_ur_rr(const MachineInstr &MI) const { return changeAddrMode_ur_rr(MI.getOpcode()); } + + MCInst getNop() const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 987c4a5fa6c4..d5c34ac467c3 100644 --- a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -104,6 +104,19 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, HexagonMCInstrInfo::setOuterLoop(MCB); return; } + if (MI->getOpcode() == Hexagon::PATCHABLE_FUNCTION_ENTER) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::FUNCTION_ENTER); + return; + } + if (MI->getOpcode() == Hexagon::PATCHABLE_FUNCTION_EXIT) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::FUNCTION_EXIT); + return; + } + if (MI->getOpcode() == Hexagon::PATCHABLE_TAIL_CALL) { + AP.EmitSled(*MI, HexagonAsmPrinter::SledKind::TAIL_CALL); + return; + } + MCInst *MCI = AP.OutContext.createMCInst(); MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 60d58f421bbb..53e82ac66b85 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -14,676 +14,44 @@ #include "HexagonMachineScheduler.h" #include "HexagonInstrInfo.h" #include 
"HexagonSubtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/DFAPacketizer.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/RegisterClassInfo.h" -#include "llvm/CodeGen/RegisterPressure.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include <algorithm> -#include <cassert> -#include <iomanip> -#include <limits> -#include <memory> -#include <sstream> +#include "llvm/CodeGen/VLIWMachineScheduler.h" using namespace llvm; #define DEBUG_TYPE "machine-scheduler" -static cl::opt<bool> IgnoreBBRegPressure("ignore-bb-reg-pressure", - cl::Hidden, cl::ZeroOrMore, cl::init(false)); - -static cl::opt<bool> UseNewerCandidate("use-newer-candidate", - cl::Hidden, cl::ZeroOrMore, cl::init(true)); - -static cl::opt<unsigned> SchedDebugVerboseLevel("misched-verbose-level", - cl::Hidden, cl::ZeroOrMore, cl::init(1)); - -// Check if the scheduler should penalize instructions that are available to -// early due to a zero-latency dependence. -static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, - cl::ZeroOrMore, cl::init(true)); - -// This value is used to determine if a register class is a high pressure set. -// We compute the maximum number of registers needed and divided by the total -// available. Then, we compare the result to this value. 
-static cl::opt<float> RPThreshold("hexagon-reg-pressure", cl::Hidden, - cl::init(0.75f), cl::desc("High register pressure threhold.")); - /// Return true if there is a dependence between SUd and SUu. -static bool hasDependence(const SUnit *SUd, const SUnit *SUu, - const HexagonInstrInfo &QII) { - if (SUd->Succs.size() == 0) - return false; +bool HexagonVLIWResourceModel::hasDependence(const SUnit *SUd, + const SUnit *SUu) { + const auto *QII = static_cast<const HexagonInstrInfo *>(TII); // Enable .cur formation. - if (QII.mayBeCurLoad(*SUd->getInstr())) + if (QII->mayBeCurLoad(*SUd->getInstr())) return false; - if (QII.canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr())) - return false; - - for (const auto &S : SUd->Succs) { - // Since we do not add pseudos to packets, might as well - // ignore order dependencies. - if (S.isCtrl()) - continue; - - if (S.getSUnit() == SUu && S.getLatency() > 0) - return true; - } - return false; -} - -/// Check if scheduling of this SU is possible -/// in the current packet. -/// It is _not_ precise (statefull), it is more like -/// another heuristic. Many corner cases are figured -/// empirically. -bool VLIWResourceModel::isResourceAvailable(SUnit *SU, bool IsTop) { - if (!SU || !SU->getInstr()) + if (QII->canExecuteInBundle(*SUd->getInstr(), *SUu->getInstr())) return false; - // First see if the pipeline could receive this instruction - // in the current cycle. 
- switch (SU->getInstr()->getOpcode()) { - default: - if (!ResourcesModel->canReserveResources(*SU->getInstr())) - return false; - break; - case TargetOpcode::EXTRACT_SUBREG: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::COPY: - case TargetOpcode::INLINEASM: - case TargetOpcode::INLINEASM_BR: - break; - } - - MachineBasicBlock *MBB = SU->getInstr()->getParent(); - auto &QST = MBB->getParent()->getSubtarget<HexagonSubtarget>(); - const auto &QII = *QST.getInstrInfo(); - - // Now see if there are no other dependencies to instructions already - // in the packet. - if (IsTop) { - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - if (hasDependence(Packet[i], SU, QII)) - return false; - } else { - for (unsigned i = 0, e = Packet.size(); i != e; ++i) - if (hasDependence(SU, Packet[i], QII)) - return false; - } - return true; -} - -/// Keep track of available resources. -bool VLIWResourceModel::reserveResources(SUnit *SU, bool IsTop) { - bool startNewCycle = false; - // Artificially reset state. - if (!SU) { - ResourcesModel->clearResources(); - Packet.clear(); - TotalPackets++; - return false; - } - // If this SU does not fit in the packet or the packet is now full - // start a new one. 
- if (!isResourceAvailable(SU, IsTop) || - Packet.size() >= SchedModel->getIssueWidth()) { - ResourcesModel->clearResources(); - Packet.clear(); - TotalPackets++; - startNewCycle = true; - } - - switch (SU->getInstr()->getOpcode()) { - default: - ResourcesModel->reserveResources(*SU->getInstr()); - break; - case TargetOpcode::EXTRACT_SUBREG: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - case TargetOpcode::CFI_INSTRUCTION: - case TargetOpcode::EH_LABEL: - case TargetOpcode::COPY: - case TargetOpcode::INLINEASM: - case TargetOpcode::INLINEASM_BR: - break; - } - Packet.push_back(SU); - -#ifndef NDEBUG - LLVM_DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n"); - for (unsigned i = 0, e = Packet.size(); i != e; ++i) { - LLVM_DEBUG(dbgs() << "\t[" << i << "] SU("); - LLVM_DEBUG(dbgs() << Packet[i]->NodeNum << ")\t"); - LLVM_DEBUG(Packet[i]->getInstr()->dump()); - } -#endif - - return startNewCycle; + return VLIWResourceModel::hasDependence(SUd, SUu); } -/// schedule - Called back from MachineScheduler::runOnMachineFunction -/// after setting up the current scheduling region. [RegionBegin, RegionEnd) -/// only includes instructions that have DAG nodes, not scheduling boundaries. -void VLIWMachineScheduler::schedule() { - LLVM_DEBUG(dbgs() << "********** MI Converging Scheduling VLIW " - << printMBBReference(*BB) << " " << BB->getName() - << " in_func " << BB->getParent()->getName() - << " at loop depth " << MLI->getLoopDepth(BB) << " \n"); - - buildDAGWithRegPressure(); - - Topo.InitDAGTopologicalSorting(); - - // Postprocess the DAG to add platform-specific artificial dependencies. - postprocessDAG(); - - SmallVector<SUnit*, 8> TopRoots, BotRoots; - findRootsAndBiasEdges(TopRoots, BotRoots); - - // Initialize the strategy before modifying the DAG. 
- SchedImpl->initialize(this); - - LLVM_DEBUG(unsigned maxH = 0; - for (unsigned su = 0, e = SUnits.size(); su != e; - ++su) if (SUnits[su].getHeight() > maxH) maxH = - SUnits[su].getHeight(); - dbgs() << "Max Height " << maxH << "\n";); - LLVM_DEBUG(unsigned maxD = 0; - for (unsigned su = 0, e = SUnits.size(); su != e; - ++su) if (SUnits[su].getDepth() > maxD) maxD = - SUnits[su].getDepth(); - dbgs() << "Max Depth " << maxD << "\n";); - LLVM_DEBUG(dump()); - - initQueues(TopRoots, BotRoots); - - bool IsTopNode = false; - while (true) { - LLVM_DEBUG( - dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); - SUnit *SU = SchedImpl->pickNode(IsTopNode); - if (!SU) break; - - if (!checkSchedLimit()) - break; - - scheduleMI(SU, IsTopNode); - - // Notify the scheduling strategy after updating the DAG. - SchedImpl->schedNode(SU, IsTopNode); - - updateQueues(SU, IsTopNode); - } - assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone."); - - placeDebugValues(); - - LLVM_DEBUG({ - dbgs() << "*** Final schedule for " - << printMBBReference(*begin()->getParent()) << " ***\n"; - dumpSchedule(); - dbgs() << '\n'; - }); +VLIWResourceModel *HexagonConvergingVLIWScheduler::createVLIWResourceModel( + const TargetSubtargetInfo &STI, const TargetSchedModel *SchedModel) const { + return new HexagonVLIWResourceModel(STI, SchedModel); } -void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) { - DAG = static_cast<VLIWMachineScheduler*>(dag); - SchedModel = DAG->getSchedModel(); - - Top.init(DAG, SchedModel); - Bot.init(DAG, SchedModel); - - // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or - // are disabled, then these HazardRecs will be disabled. 
- const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries(); - const TargetSubtargetInfo &STI = DAG->MF.getSubtarget(); - const TargetInstrInfo *TII = STI.getInstrInfo(); - delete Top.HazardRec; - delete Bot.HazardRec; - Top.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); - Bot.HazardRec = TII->CreateTargetMIHazardRecognizer(Itin, DAG); - - delete Top.ResourceModel; - delete Bot.ResourceModel; - Top.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); - Bot.ResourceModel = new VLIWResourceModel(STI, DAG->getSchedModel()); - - const std::vector<unsigned> &MaxPressure = - DAG->getRegPressure().MaxSetPressure; - HighPressureSets.assign(MaxPressure.size(), 0); - for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) { - unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i); - HighPressureSets[i] = - ((float) MaxPressure[i] > ((float) Limit * RPThreshold)); - } - - assert((!ForceTopDown || !ForceBottomUp) && - "-misched-topdown incompatible with -misched-bottomup"); -} - -void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) { - for (const SDep &PI : SU->Preds) { - unsigned PredReadyCycle = PI.getSUnit()->TopReadyCycle; - unsigned MinLatency = PI.getLatency(); -#ifndef NDEBUG - Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency); -#endif - if (SU->TopReadyCycle < PredReadyCycle + MinLatency) - SU->TopReadyCycle = PredReadyCycle + MinLatency; - } - - if (!SU->isScheduled) - Top.releaseNode(SU, SU->TopReadyCycle); -} - -void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) { - assert(SU->getInstr() && "Scheduled SUnit must have instr"); - - for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle; - unsigned MinLatency = I->getLatency(); -#ifndef NDEBUG - Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency); -#endif - if (SU->BotReadyCycle < SuccReadyCycle + MinLatency) - SU->BotReadyCycle = SuccReadyCycle 
+ MinLatency; - } - - if (!SU->isScheduled) - Bot.releaseNode(SU, SU->BotReadyCycle); -} - -/// Does this SU have a hazard within the current instruction group. -/// -/// The scheduler supports two modes of hazard recognition. The first is the -/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that -/// supports highly complicated in-order reservation tables -/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic. -/// -/// The second is a streamlined mechanism that checks for hazards based on -/// simple counters that the scheduler itself maintains. It explicitly checks -/// for instruction dispatch limitations, including the number of micro-ops that -/// can dispatch per cycle. -/// -/// TODO: Also check whether the SU must start a new group. -bool ConvergingVLIWScheduler::VLIWSchedBoundary::checkHazard(SUnit *SU) { - if (HazardRec->isEnabled()) - return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard; - - unsigned uops = SchedModel->getNumMicroOps(SU->getInstr()); - if (IssueCount + uops > SchedModel->getIssueWidth()) - return true; - - return false; -} - -void ConvergingVLIWScheduler::VLIWSchedBoundary::releaseNode(SUnit *SU, - unsigned ReadyCycle) { - if (ReadyCycle < MinReadyCycle) - MinReadyCycle = ReadyCycle; - - // Check for interlocks first. For the purpose of other heuristics, an - // instruction that cannot issue appears as if it's not in the ReadyQueue. - if (ReadyCycle > CurrCycle || checkHazard(SU)) - - Pending.push(SU); - else - Available.push(SU); -} - -/// Move the boundary of scheduled code by one cycle. -void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpCycle() { - unsigned Width = SchedModel->getIssueWidth(); - IssueCount = (IssueCount <= Width) ? 
0 : IssueCount - Width; - - assert(MinReadyCycle < std::numeric_limits<unsigned>::max() && - "MinReadyCycle uninitialized"); - unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle); - - if (!HazardRec->isEnabled()) { - // Bypass HazardRec virtual calls. - CurrCycle = NextCycle; - } else { - // Bypass getHazardType calls in case of long latency. - for (; CurrCycle != NextCycle; ++CurrCycle) { - if (isTop()) - HazardRec->AdvanceCycle(); - else - HazardRec->RecedeCycle(); - } - } - CheckPending = true; - - LLVM_DEBUG(dbgs() << "*** Next cycle " << Available.getName() << " cycle " - << CurrCycle << '\n'); -} - -/// Move the boundary of scheduled code by one SUnit. -void ConvergingVLIWScheduler::VLIWSchedBoundary::bumpNode(SUnit *SU) { - bool startNewCycle = false; - - // Update the reservation table. - if (HazardRec->isEnabled()) { - if (!isTop() && SU->isCall) { - // Calls are scheduled with their preceding instructions. For bottom-up - // scheduling, clear the pipeline state before emitting. - HazardRec->Reset(); - } - HazardRec->EmitInstruction(SU); - } - - // Update DFA model. - startNewCycle = ResourceModel->reserveResources(SU, isTop()); - - // Check the instruction group dispatch limit. - // TODO: Check if this SU must end a dispatch group. - IssueCount += SchedModel->getNumMicroOps(SU->getInstr()); - if (startNewCycle) { - LLVM_DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n'); - bumpCycle(); - } - else - LLVM_DEBUG(dbgs() << "*** IssueCount " << IssueCount << " at cycle " - << CurrCycle << '\n'); -} - -/// Release pending ready nodes in to the available queue. This makes them -/// visible to heuristics. -void ConvergingVLIWScheduler::VLIWSchedBoundary::releasePending() { - // If the available queue is empty, it is safe to reset MinReadyCycle. - if (Available.empty()) - MinReadyCycle = std::numeric_limits<unsigned>::max(); - - // Check to see if any of the pending instructions are ready to issue. If - // so, add them to the available queue. 
- for (unsigned i = 0, e = Pending.size(); i != e; ++i) { - SUnit *SU = *(Pending.begin()+i); - unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle; - - if (ReadyCycle < MinReadyCycle) - MinReadyCycle = ReadyCycle; - - if (ReadyCycle > CurrCycle) - continue; - - if (checkHazard(SU)) - continue; - - Available.push(SU); - Pending.remove(Pending.begin()+i); - --i; --e; - } - CheckPending = false; -} - -/// Remove SU from the ready set for this boundary. -void ConvergingVLIWScheduler::VLIWSchedBoundary::removeReady(SUnit *SU) { - if (Available.isInQueue(SU)) - Available.remove(Available.find(SU)); - else { - assert(Pending.isInQueue(SU) && "bad ready count"); - Pending.remove(Pending.find(SU)); - } -} +int HexagonConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, + SchedCandidate &Candidate, + RegPressureDelta &Delta, + bool verbose) { + int ResCount = + ConvergingVLIWScheduler::SchedulingCost(Q, SU, Candidate, Delta, verbose); -/// If this queue only has one ready candidate, return it. As a side effect, -/// advance the cycle until at least one node is ready. If multiple instructions -/// are ready, return NULL. 
-SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() { - if (CheckPending) - releasePending(); - - auto AdvanceCycle = [this]() { - if (Available.empty()) - return true; - if (Available.size() == 1 && Pending.size() > 0) - return !ResourceModel->isResourceAvailable(*Available.begin(), isTop()) || - getWeakLeft(*Available.begin(), isTop()) != 0; - return false; - }; - for (unsigned i = 0; AdvanceCycle(); ++i) { - assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) && - "permanent hazard"); (void)i; - ResourceModel->reserveResources(nullptr, isTop()); - bumpCycle(); - releasePending(); - } - if (Available.size() == 1) - return *Available.begin(); - return nullptr; -} - -#ifndef NDEBUG -void ConvergingVLIWScheduler::traceCandidate(const char *Label, - const ReadyQueue &Q, SUnit *SU, int Cost, PressureChange P) { - dbgs() << Label << " " << Q.getName() << " "; - if (P.isValid()) - dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":" - << P.getUnitInc() << " "; - else - dbgs() << " "; - dbgs() << "cost(" << Cost << ")\t"; - DAG->dumpNode(*SU); -} - -// Very detailed queue dump, to be used with higher verbosity levels. 
-void ConvergingVLIWScheduler::readyQueueVerboseDump( - const RegPressureTracker &RPTracker, SchedCandidate &Candidate, - ReadyQueue &Q) { - RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); - - dbgs() << ">>> " << Q.getName() << "\n"; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - std::stringstream dbgstr; - dbgstr << "SU(" << std::setw(3) << (*I)->NodeNum << ")"; - dbgs() << dbgstr.str(); - SchedulingCost(Q, *I, Candidate, RPDelta, true); - dbgs() << "\t"; - (*I)->getInstr()->dump(); - } - dbgs() << "\n"; -} -#endif - -/// isSingleUnscheduledPred - If SU2 is the only unscheduled predecessor -/// of SU, return true (we may have duplicates) -static inline bool isSingleUnscheduledPred(SUnit *SU, SUnit *SU2) { - if (SU->NumPredsLeft == 0) - return false; - - for (auto &Pred : SU->Preds) { - // We found an available, but not scheduled, predecessor. - if (!Pred.getSUnit()->isScheduled && (Pred.getSUnit() != SU2)) - return false; - } - - return true; -} - -/// isSingleUnscheduledSucc - If SU2 is the only unscheduled successor -/// of SU, return true (we may have duplicates) -static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) { - if (SU->NumSuccsLeft == 0) - return false; - - for (auto &Succ : SU->Succs) { - // We found an available, but not scheduled, successor. - if (!Succ.getSUnit()->isScheduled && (Succ.getSUnit() != SU2)) - return false; - } - return true; -} - -/// Check if the instruction changes the register pressure of a register in the -/// high pressure set. The function returns a negative value if the pressure -/// decreases and a positive value is the pressure increases. If the instruction -/// doesn't use a high pressure register or doesn't change the register -/// pressure, then return 0. 
-int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) { - PressureDiff &PD = DAG->getPressureDiff(SU); - for (auto &P : PD) { - if (!P.isValid()) - continue; - // The pressure differences are computed bottom-up, so the comparision for - // an increase is positive in the bottom direction, but negative in the - // top-down direction. - if (HighPressureSets[P.getPSet()]) - return (isBotUp ? P.getUnitInc() : -P.getUnitInc()); - } - return 0; -} - -// Constants used to denote relative importance of -// heuristic components for cost computation. -static const unsigned PriorityOne = 200; -static const unsigned PriorityTwo = 50; -static const unsigned PriorityThree = 75; -static const unsigned ScaleTwo = 10; - -/// Single point to compute overall scheduling cost. -/// TODO: More heuristics will be used soon. -int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, - SchedCandidate &Candidate, - RegPressureDelta &Delta, - bool verbose) { - // Initial trivial priority. - int ResCount = 1; - - // Do not waste time on a node that is already scheduled. if (!SU || SU->isScheduled) return ResCount; - LLVM_DEBUG(if (verbose) dbgs() - << ((Q.getID() == TopQID) ? "(top|" : "(bot|")); - // Forced priority is high. - if (SU->isScheduleHigh) { - ResCount += PriorityOne; - LLVM_DEBUG(dbgs() << "H|"); - } - - unsigned IsAvailableAmt = 0; - // Critical path first. - if (Q.getID() == TopQID) { - if (Top.isLatencyBound(SU)) { - LLVM_DEBUG(if (verbose) dbgs() << "LB|"); - ResCount += (SU->getHeight() * ScaleTwo); - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "h" << std::setw(3) << SU->getHeight() << "|"; - dbgs() << dbgstr.str(); - }); - - // If resources are available for it, multiply the - // chance of scheduling. 
- if (Top.ResourceModel->isResourceAvailable(SU, true)) { - IsAvailableAmt = (PriorityTwo + PriorityThree); - ResCount += IsAvailableAmt; - LLVM_DEBUG(if (verbose) dbgs() << "A|"); - } else - LLVM_DEBUG(if (verbose) dbgs() << " |"); - } else { - if (Bot.isLatencyBound(SU)) { - LLVM_DEBUG(if (verbose) dbgs() << "LB|"); - ResCount += (SU->getDepth() * ScaleTwo); - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "d" << std::setw(3) << SU->getDepth() << "|"; - dbgs() << dbgstr.str(); - }); - - // If resources are available for it, multiply the - // chance of scheduling. - if (Bot.ResourceModel->isResourceAvailable(SU, false)) { - IsAvailableAmt = (PriorityTwo + PriorityThree); - ResCount += IsAvailableAmt; - LLVM_DEBUG(if (verbose) dbgs() << "A|"); - } else - LLVM_DEBUG(if (verbose) dbgs() << " |"); - } - - unsigned NumNodesBlocking = 0; - if (Q.getID() == TopQID) { - // How many SUs does it block from scheduling? - // Look at all of the successors of this node. - // Count the number of nodes that - // this node is the sole unscheduled node for. - if (Top.isLatencyBound(SU)) - for (const SDep &SI : SU->Succs) - if (isSingleUnscheduledPred(SI.getSUnit(), SU)) - ++NumNodesBlocking; - } else { - // How many unscheduled predecessors block this node? - if (Bot.isLatencyBound(SU)) - for (const SDep &PI : SU->Preds) - if (isSingleUnscheduledSucc(PI.getSUnit(), SU)) - ++NumNodesBlocking; - } - ResCount += (NumNodesBlocking * ScaleTwo); - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "blk " << std::setw(2) << NumNodesBlocking << ")|"; - dbgs() << dbgstr.str(); - }); - - // Factor in reg pressure as a heuristic. - if (!IgnoreBBRegPressure) { - // Decrease priority by the amount that register pressure exceeds the limit. - ResCount -= (Delta.Excess.getUnitInc()*PriorityOne); - // Decrease priority if register pressure exceeds the limit. 
- ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityOne); - // Decrease priority slightly if register pressure would increase over the - // current maximum. - ResCount -= (Delta.CurrentMax.getUnitInc()*PriorityTwo); - // If there are register pressure issues, then we remove the value added for - // the instruction being available. The rationale is that we really don't - // want to schedule an instruction that causes a spill. - if (IsAvailableAmt && pressureChange(SU, Q.getID() != TopQID) > 0 && - (Delta.Excess.getUnitInc() || Delta.CriticalMax.getUnitInc() || - Delta.CurrentMax.getUnitInc())) - ResCount -= IsAvailableAmt; - LLVM_DEBUG(if (verbose) { - dbgs() << "RP " << Delta.Excess.getUnitInc() << "/" - << Delta.CriticalMax.getUnitInc() << "/" - << Delta.CurrentMax.getUnitInc() << ")|"; - }); - } - - // Give a little extra priority to a .cur instruction if there is a resource - // available for it. auto &QST = DAG->MF.getSubtarget<HexagonSubtarget>(); auto &QII = *QST.getInstrInfo(); if (SU->isInstr() && QII.mayBeCurLoad(*SU->getInstr())) { @@ -698,303 +66,5 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU, } } - // Give preference to a zero latency instruction if the dependent - // instruction is in the current packet. 
- if (Q.getID() == TopQID && getWeakLeft(SU, true) == 0) { - for (const SDep &PI : SU->Preds) { - if (!PI.getSUnit()->getInstr()->isPseudo() && PI.isAssignedRegDep() && - PI.getLatency() == 0 && - Top.ResourceModel->isInPacket(PI.getSUnit())) { - ResCount += PriorityThree; - LLVM_DEBUG(if (verbose) dbgs() << "Z|"); - } - } - } else if (Q.getID() == BotQID && getWeakLeft(SU, false) == 0) { - for (const SDep &SI : SU->Succs) { - if (!SI.getSUnit()->getInstr()->isPseudo() && SI.isAssignedRegDep() && - SI.getLatency() == 0 && - Bot.ResourceModel->isInPacket(SI.getSUnit())) { - ResCount += PriorityThree; - LLVM_DEBUG(if (verbose) dbgs() << "Z|"); - } - } - } - - // If the instruction has a non-zero latency dependence with an instruction in - // the current packet, then it should not be scheduled yet. The case occurs - // when the dependent instruction is scheduled in a new packet, so the - // scheduler updates the current cycle and pending instructions become - // available. - if (CheckEarlyAvail) { - if (Q.getID() == TopQID) { - for (const auto &PI : SU->Preds) { - if (PI.getLatency() > 0 && - Top.ResourceModel->isInPacket(PI.getSUnit())) { - ResCount -= PriorityOne; - LLVM_DEBUG(if (verbose) dbgs() << "D|"); - } - } - } else { - for (const auto &SI : SU->Succs) { - if (SI.getLatency() > 0 && - Bot.ResourceModel->isInPacket(SI.getSUnit())) { - ResCount -= PriorityOne; - LLVM_DEBUG(if (verbose) dbgs() << "D|"); - } - } - } - } - - LLVM_DEBUG(if (verbose) { - std::stringstream dbgstr; - dbgstr << "Total " << std::setw(4) << ResCount << ")"; - dbgs() << dbgstr.str(); - }); - return ResCount; } - -/// Pick the best candidate from the top queue. -/// -/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during -/// DAG building. To adjust for the current scheduling location we need to -/// maintain the number of vreg uses remaining to be top-scheduled. 
-ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler:: -pickNodeFromQueue(VLIWSchedBoundary &Zone, const RegPressureTracker &RPTracker, - SchedCandidate &Candidate) { - ReadyQueue &Q = Zone.Available; - LLVM_DEBUG(if (SchedDebugVerboseLevel > 1) - readyQueueVerboseDump(RPTracker, Candidate, Q); - else Q.dump();); - - // getMaxPressureDelta temporarily modifies the tracker. - RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker); - - // BestSU remains NULL if no top candidates beat the best existing candidate. - CandResult FoundCandidate = NoCand; - for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) { - RegPressureDelta RPDelta; - TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta, - DAG->getRegionCriticalPSets(), - DAG->getRegPressure().MaxSetPressure); - - int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false); - - // Initialize the candidate if needed. - if (!Candidate.SU) { - LLVM_DEBUG(traceCandidate("DCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - continue; - } - - // Choose node order for negative cost candidates. There is no good - // candidate in this case. - if (CurrentCost < 0 && Candidate.SCost < 0) { - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - LLVM_DEBUG(traceCandidate("NCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - } - continue; - } - - // Best cost. - if (CurrentCost > Candidate.SCost) { - LLVM_DEBUG(traceCandidate("CCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = BestCost; - continue; - } - - // Choose an instruction that does not depend on an artificial edge. 
- unsigned CurrWeak = getWeakLeft(*I, (Q.getID() == TopQID)); - unsigned CandWeak = getWeakLeft(Candidate.SU, (Q.getID() == TopQID)); - if (CurrWeak != CandWeak) { - if (CurrWeak < CandWeak) { - LLVM_DEBUG(traceCandidate("WCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = Weak; - } - continue; - } - - if (CurrentCost == Candidate.SCost && Zone.isLatencyBound(*I)) { - unsigned CurrSize, CandSize; - if (Q.getID() == TopQID) { - CurrSize = (*I)->Succs.size(); - CandSize = Candidate.SU->Succs.size(); - } else { - CurrSize = (*I)->Preds.size(); - CandSize = Candidate.SU->Preds.size(); - } - if (CurrSize > CandSize) { - LLVM_DEBUG(traceCandidate("SPCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = BestCost; - } - // Keep the old candidate if it's a better candidate. That is, don't use - // the subsequent tie breaker. - if (CurrSize != CandSize) - continue; - } - - // Tie breaker. - // To avoid scheduling indeterminism, we need a tie breaker - // for the case when cost is identical for two nodes. - if (UseNewerCandidate && CurrentCost == Candidate.SCost) { - if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum) - || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) { - LLVM_DEBUG(traceCandidate("TCAND", Q, *I, CurrentCost)); - Candidate.SU = *I; - Candidate.RPDelta = RPDelta; - Candidate.SCost = CurrentCost; - FoundCandidate = NodeOrder; - continue; - } - } - - // Fall through to original instruction order. - // Only consider node order if Candidate was chosen from this Q. - if (FoundCandidate == NoCand) - continue; - } - return FoundCandidate; -} - -/// Pick the best candidate node from either the top or bottom queue. -SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) { - // Schedule as far as possible in the direction of no choice. 
This is most - // efficient, but also provides the best heuristics for CriticalPSets. - if (SUnit *SU = Bot.pickOnlyChoice()) { - LLVM_DEBUG(dbgs() << "Picked only Bottom\n"); - IsTopNode = false; - return SU; - } - if (SUnit *SU = Top.pickOnlyChoice()) { - LLVM_DEBUG(dbgs() << "Picked only Top\n"); - IsTopNode = true; - return SU; - } - SchedCandidate BotCand; - // Prefer bottom scheduling when heuristics are silent. - CandResult BotResult = pickNodeFromQueue(Bot, - DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - - // If either Q has a single candidate that provides the least increase in - // Excess pressure, we can immediately schedule from that Q. - // - // RegionCriticalPSets summarizes the pressure within the scheduled region and - // affects picking from either Q. If scheduling in one direction must - // increase pressure for one of the excess PSets, then schedule in that - // direction first to provide more freedom in the other direction. - if (BotResult == SingleExcess || BotResult == SingleCritical) { - LLVM_DEBUG(dbgs() << "Prefered Bottom Node\n"); - IsTopNode = false; - return BotCand.SU; - } - // Check if the top Q has a better candidate. - SchedCandidate TopCand; - CandResult TopResult = pickNodeFromQueue(Top, - DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - - if (TopResult == SingleExcess || TopResult == SingleCritical) { - LLVM_DEBUG(dbgs() << "Prefered Top Node\n"); - IsTopNode = true; - return TopCand.SU; - } - // If either Q has a single candidate that minimizes pressure above the - // original region's pressure pick it. 
- if (BotResult == SingleMax) { - LLVM_DEBUG(dbgs() << "Prefered Bottom Node SingleMax\n"); - IsTopNode = false; - return BotCand.SU; - } - if (TopResult == SingleMax) { - LLVM_DEBUG(dbgs() << "Prefered Top Node SingleMax\n"); - IsTopNode = true; - return TopCand.SU; - } - if (TopCand.SCost > BotCand.SCost) { - LLVM_DEBUG(dbgs() << "Prefered Top Node Cost\n"); - IsTopNode = true; - return TopCand.SU; - } - // Otherwise prefer the bottom candidate in node order. - LLVM_DEBUG(dbgs() << "Prefered Bottom in Node order\n"); - IsTopNode = false; - return BotCand.SU; -} - -/// Pick the best node to balance the schedule. Implements MachineSchedStrategy. -SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) { - if (DAG->top() == DAG->bottom()) { - assert(Top.Available.empty() && Top.Pending.empty() && - Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); - return nullptr; - } - SUnit *SU; - if (ForceTopDown) { - SU = Top.pickOnlyChoice(); - if (!SU) { - SchedCandidate TopCand; - CandResult TopResult = - pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand); - assert(TopResult != NoCand && "failed to find the first candidate"); - (void)TopResult; - SU = TopCand.SU; - } - IsTopNode = true; - } else if (ForceBottomUp) { - SU = Bot.pickOnlyChoice(); - if (!SU) { - SchedCandidate BotCand; - CandResult BotResult = - pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand); - assert(BotResult != NoCand && "failed to find the first candidate"); - (void)BotResult; - SU = BotCand.SU; - } - IsTopNode = false; - } else { - SU = pickNodeBidrectional(IsTopNode); - } - if (SU->isTopReady()) - Top.removeReady(SU); - if (SU->isBottomReady()) - Bot.removeReady(SU); - - LLVM_DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom") - << " Scheduling instruction in cycle " - << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << " (" - << reportPackets() << ")\n"; - DAG->dumpNode(*SU)); - return SU; -} - -/// Update the scheduler's state after scheduling a node. 
This is the same node -/// that was just returned by pickNode(). However, VLIWMachineScheduler needs -/// to update it's state based on the current cycle before MachineSchedStrategy -/// does. -void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) { - if (IsTopNode) { - Top.bumpNode(SU); - SU->TopReadyCycle = Top.CurrCycle; - } else { - Bot.bumpNode(SU); - SU->BotReadyCycle = Bot.CurrCycle; - } -} diff --git a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h index fb0a7abd339b..3d8f557dc787 100644 --- a/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h +++ b/llvm/lib/Target/Hexagon/HexagonMachineScheduler.h @@ -13,261 +13,28 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H #define LLVM_LIB_TARGET_HEXAGON_HEXAGONMACHINESCHEDULER_H -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/DFAPacketizer.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterPressure.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetSchedule.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include <algorithm> -#include <cassert> -#include <limits> -#include <memory> -#include <vector> +#include "llvm/CodeGen/VLIWMachineScheduler.h" namespace llvm { class SUnit; -class VLIWResourceModel { - /// ResourcesModel - Represents VLIW state. - /// Not limited to VLIW targets per se, but assumes - /// definition of DFA by a target. - DFAPacketizer *ResourcesModel; - - const TargetSchedModel *SchedModel; - - /// Local packet/bundle model. Purely - /// internal to the MI schedulre at the time. - std::vector<SUnit *> Packet; - - /// Total packets created. 
- unsigned TotalPackets = 0; - +class HexagonVLIWResourceModel : public VLIWResourceModel { public: - VLIWResourceModel(const TargetSubtargetInfo &STI, const TargetSchedModel *SM) - : SchedModel(SM) { - ResourcesModel = STI.getInstrInfo()->CreateTargetScheduleState(STI); - - // This hard requirement could be relaxed, - // but for now do not let it proceed. - assert(ResourcesModel && "Unimplemented CreateTargetScheduleState."); - - Packet.resize(SchedModel->getIssueWidth()); - Packet.clear(); - ResourcesModel->clearResources(); - } - - ~VLIWResourceModel() { - delete ResourcesModel; - } - - void resetPacketState() { - Packet.clear(); - } - - void resetDFA() { - ResourcesModel->clearResources(); - } - - void reset() { - Packet.clear(); - ResourcesModel->clearResources(); - } - - bool isResourceAvailable(SUnit *SU, bool IsTop); - bool reserveResources(SUnit *SU, bool IsTop); - unsigned getTotalPackets() const { return TotalPackets; } - bool isInPacket(SUnit *SU) const { return is_contained(Packet, SU); } + using VLIWResourceModel::VLIWResourceModel; + bool hasDependence(const SUnit *SUd, const SUnit *SUu) override; }; -/// Extend the standard ScheduleDAGMI to provide more context and override the -/// top-level schedule() driver. -class VLIWMachineScheduler : public ScheduleDAGMILive { -public: - VLIWMachineScheduler(MachineSchedContext *C, - std::unique_ptr<MachineSchedStrategy> S) - : ScheduleDAGMILive(C, std::move(S)) {} - - /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's - /// time to do some work. - void schedule() override; - - RegisterClassInfo *getRegClassInfo() { return RegClassInfo; } - int getBBSize() { return BB->size(); } -}; - -//===----------------------------------------------------------------------===// -// ConvergingVLIWScheduler - Implementation of the standard -// MachineSchedStrategy. 
-//===----------------------------------------------------------------------===// - -/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics -/// to balance the schedule. -class ConvergingVLIWScheduler : public MachineSchedStrategy { - /// Store the state used by ConvergingVLIWScheduler heuristics, required - /// for the lifetime of one invocation of pickNode(). - struct SchedCandidate { - // The best SUnit candidate. - SUnit *SU = nullptr; - - // Register pressure values for the best candidate. - RegPressureDelta RPDelta; - - // Best scheduling cost. - int SCost = 0; - - SchedCandidate() = default; - }; - /// Represent the type of SchedCandidate found within a single queue. - enum CandResult { - NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure, - BestCost, Weak}; - - /// Each Scheduling boundary is associated with ready queues. It tracks the - /// current cycle in whichever direction at has moved, and maintains the state - /// of "hazards" and other interlocks at the current cycle. - struct VLIWSchedBoundary { - VLIWMachineScheduler *DAG = nullptr; - const TargetSchedModel *SchedModel = nullptr; - - ReadyQueue Available; - ReadyQueue Pending; - bool CheckPending = false; - - ScheduleHazardRecognizer *HazardRec = nullptr; - VLIWResourceModel *ResourceModel = nullptr; - - unsigned CurrCycle = 0; - unsigned IssueCount = 0; - unsigned CriticalPathLength = 0; - - /// MinReadyCycle - Cycle of the soonest available instruction. - unsigned MinReadyCycle = std::numeric_limits<unsigned>::max(); - - // Remember the greatest min operand latency. - unsigned MaxMinLatency = 0; - - /// Pending queues extend the ready queues with the same ID and the - /// PendingFlag set. 
- VLIWSchedBoundary(unsigned ID, const Twine &Name) - : Available(ID, Name+".A"), - Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P") {} - - ~VLIWSchedBoundary() { - delete ResourceModel; - delete HazardRec; - } - - void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) { - DAG = dag; - SchedModel = smodel; - CurrCycle = 0; - IssueCount = 0; - // Initialize the critical path length limit, which used by the scheduling - // cost model to determine the value for scheduling an instruction. We use - // a slightly different heuristic for small and large functions. For small - // functions, it's important to use the height/depth of the instruction. - // For large functions, prioritizing by height or depth increases spills. - CriticalPathLength = DAG->getBBSize() / SchedModel->getIssueWidth(); - if (DAG->getBBSize() < 50) - // We divide by two as a cheap and simple heuristic to reduce the - // critcal path length, which increases the priority of using the graph - // height/depth in the scheduler's cost computation. - CriticalPathLength >>= 1; - else { - // For large basic blocks, we prefer a larger critical path length to - // decrease the priority of using the graph height/depth. - unsigned MaxPath = 0; - for (auto &SU : DAG->SUnits) - MaxPath = std::max(MaxPath, isTop() ? SU.getHeight() : SU.getDepth()); - CriticalPathLength = std::max(CriticalPathLength, MaxPath) + 1; - } - } - - bool isTop() const { - return Available.getID() == ConvergingVLIWScheduler::TopQID; - } - - bool checkHazard(SUnit *SU); - - void releaseNode(SUnit *SU, unsigned ReadyCycle); - - void bumpCycle(); - - void bumpNode(SUnit *SU); - - void releasePending(); - - void removeReady(SUnit *SU); - - SUnit *pickOnlyChoice(); - - bool isLatencyBound(SUnit *SU) { - if (CurrCycle >= CriticalPathLength) - return true; - unsigned PathLength = isTop() ? 
SU->getHeight() : SU->getDepth(); - return CriticalPathLength - CurrCycle <= PathLength; - } - }; - - VLIWMachineScheduler *DAG = nullptr; - const TargetSchedModel *SchedModel = nullptr; - - // State of the top and bottom scheduled instruction boundaries. - VLIWSchedBoundary Top; - VLIWSchedBoundary Bot; - - /// List of pressure sets that have a high pressure level in the region. - std::vector<bool> HighPressureSets; - -public: - /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both) - enum { - TopQID = 1, - BotQID = 2, - LogMaxQID = 2 - }; - - ConvergingVLIWScheduler() : Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {} - - void initialize(ScheduleDAGMI *dag) override; - - SUnit *pickNode(bool &IsTopNode) override; - - void schedNode(SUnit *SU, bool IsTopNode) override; - - void releaseTopNode(SUnit *SU) override; - - void releaseBottomNode(SUnit *SU) override; - - unsigned reportPackets() { - return Top.ResourceModel->getTotalPackets() + - Bot.ResourceModel->getTotalPackets(); - } - +class HexagonConvergingVLIWScheduler : public ConvergingVLIWScheduler { protected: - SUnit *pickNodeBidrectional(bool &IsTopNode); - - int pressureChange(const SUnit *SU, bool isBotUp); - - int SchedulingCost(ReadyQueue &Q, - SUnit *SU, SchedCandidate &Candidate, - RegPressureDelta &Delta, bool verbose); - - CandResult pickNodeFromQueue(VLIWSchedBoundary &Zone, - const RegPressureTracker &RPTracker, - SchedCandidate &Candidate); -#ifndef NDEBUG - void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU, - int Cost, PressureChange P = PressureChange()); - - void readyQueueVerboseDump(const RegPressureTracker &RPTracker, - SchedCandidate &Candidate, ReadyQueue &Q); -#endif + VLIWResourceModel * + createVLIWResourceModel(const TargetSubtargetInfo &STI, + const TargetSchedModel *SchedModel) const override; + int SchedulingCost(ReadyQueue &Q, SUnit *SU, SchedCandidate &Candidate, + RegPressureDelta &Delta, bool verbose) override; }; } // end namespace llvm diff --git 
a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td index 11f8af7c41a0..afd63d6d4aa7 100644 --- a/llvm/lib/Target/Hexagon/HexagonPseudo.td +++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td @@ -572,3 +572,14 @@ defm PS_storerd : NewCircularStore<DoubleRegs, WordAccess>; // __builtin_trap. let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1, isSolo = 1 in def PS_crash: InstHexagon<(outs), (ins), "", [], "", PSEUDO, TypePSEUDO>; + +// This is actual trap1 instruction from before v65. It's here since it is +// no longer included in DepInstrInfo.td. +def PS_trap1 : HInst<(outs), (ins u8_0Imm:$Ii), "trap1(#$Ii)", tc_53c851ab, + TypeJ>, Enc_a51a9a, Requires<[HasPreV65]> { + let Inst{1-0} = 0b00; + let Inst{7-5} = 0b000; + let Inst{13-13} = 0b0; + let Inst{31-16} = 0b0101010010000000; +} + diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td index 88d775f16a7f..931578c9e78d 100644 --- a/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -69,3 +69,4 @@ include "HexagonScheduleV66.td" include "HexagonScheduleV67.td" include "HexagonScheduleV67T.td" include "HexagonScheduleV68.td" +include "HexagonScheduleV69.td" diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV69.td b/llvm/lib/Target/Hexagon/HexagonScheduleV69.td new file mode 100644 index 000000000000..ddd246866e20 --- /dev/null +++ b/llvm/lib/Target/Hexagon/HexagonScheduleV69.td @@ -0,0 +1,40 @@ +//=-HexagonScheduleV69.td - HexagonV69 Scheduling Definitions *- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// +// ScalarItin and HVXItin contain some old itineraries +// still used by a handful of instructions. Hopefully, we will be able +// to get rid of them soon. 
+def HexagonV69ItinList : DepScalarItinV69, ScalarItin, + DepHVXItinV69, HVXItin, PseudoItin { + list<InstrItinData> ItinList = + !listconcat(DepScalarItinV69_list, ScalarItin_list, + DepHVXItinV69_list, HVXItin_list, PseudoItin_list); +} + +def HexagonItinerariesV69 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL, + CVI_ALL_NOMEM, CVI_ZW], + [Hex_FWD, HVX_FWD], + HexagonV69ItinList.ItinList>; + +def HexagonModelV69 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV69; + let LoadLatency = 1; + let CompleteModel = 0; +} + +//===----------------------------------------------------------------------===// +// Hexagon V69 Resource Definitions - +//===----------------------------------------------------------------------===// + diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ecb2f88d8096..08bb4580b585 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -75,6 +75,10 @@ static cl::opt<bool> EnableCheckBankConflict("hexagon-check-bank-conflict", cl::Hidden, cl::ZeroOrMore, cl::init(true), cl::desc("Enable checking for cache bank conflicts")); +static cl::opt<bool> EnableV68FloatCodeGen( + "force-hvx-float", cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable the code-generation for vector float instructions on v68.")); + HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, const TargetMachine &TM) : HexagonGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), @@ -103,13 +107,71 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { UseAudioOps = false; UseLongCalls = false; - UseBSBScheduling = hasV60Ops() && EnableBSBSched; + SubtargetFeatures Features(FS); + + // Turn on QFloat if the HVX version is v68+. 
+ // The function ParseSubtargetFeatures will set feature bits and initialize + // subtarget's variables all in one, so there isn't a good way to preprocess + // the feature string, other than by tinkering with it directly. + auto IsQFloatFS = [](StringRef F) { + return F == "+hvx-qfloat" || F == "-hvx-qfloat"; + }; + if (!llvm::count_if(Features.getFeatures(), IsQFloatFS)) { + auto getHvxVersion = [&Features](StringRef FS) -> StringRef { + for (StringRef F : llvm::reverse(Features.getFeatures())) { + if (F.startswith("+hvxv")) + return F; + } + for (StringRef F : llvm::reverse(Features.getFeatures())) { + if (F == "-hvx") + return StringRef(); + if (F.startswith("+hvx") || F == "-hvx") + return F.take_front(4); // Return "+hvx" or "-hvx". + } + return StringRef(); + }; + + bool AddQFloat = false; + StringRef HvxVer = getHvxVersion(FS); + if (HvxVer.startswith("+hvxv")) { + int Ver = 0; + if (!HvxVer.drop_front(5).consumeInteger(10, Ver) && Ver >= 68) + AddQFloat = true; + } else if (HvxVer == "+hvx") { + if (hasV68Ops()) + AddQFloat = true; + } - ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS); + if (AddQFloat) + Features.AddFeature("+hvx-qfloat"); + } + + std::string FeatureString = Features.getString(); + ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FeatureString); + + // Enable float code generation only if the flag(s) are set and + // the feature is enabled. v68 is guarded by additional flags. 
+ bool GreaterThanV68 = false; + if (useHVXV69Ops()) + GreaterThanV68 = true; + + // Support for deprecated qfloat/ieee codegen flags + if (!GreaterThanV68) { + if (EnableV68FloatCodeGen) + UseHVXFloatingPoint = true; + } else { + UseHVXFloatingPoint = true; + } + + if (UseHVXQFloatOps && UseHVXIEEEFPOps && UseHVXFloatingPoint) + LLVM_DEBUG( + dbgs() << "Behavior is undefined for simultaneous qfloat and ieee hvx codegen..."); if (OverrideLongCalls.getPosition()) UseLongCalls = OverrideLongCalls; + UseBSBScheduling = hasV60Ops() && EnableBSBSched; + if (isTinyCore()) { // Tiny core has a single thread, so back-to-back scheduling is enabled by // default. @@ -117,10 +179,10 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { UseBSBScheduling = false; } - FeatureBitset Features = getFeatureBits(); + FeatureBitset FeatureBits = getFeatureBits(); if (HexagonDisableDuplex) - setFeatureBits(Features.reset(Hexagon::FeatureDuplex)); - setFeatureBits(Hexagon_MC::completeHVXFeatures(Features)); + setFeatureBits(FeatureBits.reset(Hexagon::FeatureDuplex)); + setFeatureBits(Hexagon_MC::completeHVXFeatures(FeatureBits)); return *this; } diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index a4f2e159bf4b..e4f375440be1 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -56,6 +56,10 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo { bool UseSmallData = false; bool UseUnsafeMath = false; bool UseZRegOps = false; + bool UseHVXIEEEFPOps = false; + bool UseHVXQFloatOps = false; + bool UseHVXFloatingPoint = false; + bool UseCabac = false; bool HasPreV65 = false; bool HasMemNoShuf = false; @@ -138,6 +142,8 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); + bool isXRaySupported() const override { return true; } + bool hasV5Ops() const { return getHexagonArchVersion() >= Hexagon::ArchEnum::V5; } @@ -186,6 +192,12 @@ public: bool hasV68OpsOnly() const { return getHexagonArchVersion() == Hexagon::ArchEnum::V68; } + bool hasV69Ops() const { + return getHexagonArchVersion() >= Hexagon::ArchEnum::V69; + } + bool hasV69OpsOnly() const { + return getHexagonArchVersion() == Hexagon::ArchEnum::V69; + } bool useAudioOps() const { return UseAudioOps; } bool useCompound() const { return UseCompound; } @@ -197,10 +209,16 @@ public: bool useSmallData() const { return UseSmallData; } bool useUnsafeMath() const { return UseUnsafeMath; } bool useZRegOps() const { return UseZRegOps; } + bool useCabac() const { return UseCabac; } bool isTinyCore() const { return HexagonProcFamily == TinyCore; } bool isTinyCoreWithDuplex() const { return isTinyCore() && EnableDuplex; } + bool useHVXIEEEFPOps() const { return UseHVXIEEEFPOps && useHVXOps(); } + bool useHVXQFloatOps() const { + return UseHVXQFloatOps && HexagonHVXVersion >= Hexagon::ArchEnum::V68; + } + bool useHVXFloatingPoint() const { return UseHVXFloatingPoint; } bool useHVXOps() const { return HexagonHVXVersion > Hexagon::ArchEnum::NoArch; } @@ -222,6 +240,9 @@ public: bool useHVXV68Ops() const { return HexagonHVXVersion >= Hexagon::ArchEnum::V68; } + bool useHVXV69Ops() const { + return HexagonHVXVersion >= Hexagon::ArchEnum::V69; + } bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; } bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; } @@ -281,7 +302,11 @@ public: } ArrayRef<MVT> getHVXElementTypes() const { - static MVT Types[] = { MVT::i8, MVT::i16, MVT::i32 }; + static MVT Types[] = {MVT::i8, MVT::i16, MVT::i32}; + static MVT TypesV68[] = {MVT::i8, MVT::i16, MVT::i32, MVT::f16, MVT::f32}; + + if (useHVXV68Ops() && useHVXFloatingPoint()) + return makeArrayRef(TypesV68); 
return makeArrayRef(Types); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index 66de698182d7..fcf829b522cc 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -21,6 +21,7 @@ #include "TargetInfo/HexagonTargetInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/VLIWMachineScheduler.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/MC/TargetRegistry.h" @@ -120,8 +121,8 @@ extern "C" int HexagonTargetMachineModule; int HexagonTargetMachineModule = 0; static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = - new VLIWMachineScheduler(C, std::make_unique<ConvergingVLIWScheduler>()); + ScheduleDAGMILive *DAG = new VLIWMachineScheduler( + C, std::make_unique<HexagonConvergingVLIWScheduler>()); DAG->addMutation(std::make_unique<HexagonSubtarget::UsrOverflowMutation>()); DAG->addMutation(std::make_unique<HexagonSubtarget::HVXMemLatencyMutation>()); DAG->addMutation(std::make_unique<HexagonSubtarget::CallMutation>()); diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 1d325553f45a..85ec0cdcd8f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -294,7 +294,7 @@ bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) { bool Avail = ResourceTracker->canReserveResources(*ExtMI); if (Reserve && Avail) ResourceTracker->reserveResources(*ExtMI); - MF.DeleteMachineInstr(ExtMI); + MF.deleteMachineInstr(ExtMI); return Avail; } @@ -890,7 +890,7 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI, const MCInstrDesc &D = HII->get(NewOpcode); MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); bool ResourcesAvailable = 
ResourceTracker->canReserveResources(*NewMI); - MF.DeleteMachineInstr(NewMI); + MF.deleteMachineInstr(NewMI); if (!ResourcesAvailable) return false; @@ -1082,6 +1082,11 @@ bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) { if (HII->isSolo(MI)) return true; + if (MI.getOpcode() == Hexagon::PATCHABLE_FUNCTION_ENTER || + MI.getOpcode() == Hexagon::PATCHABLE_FUNCTION_EXIT || + MI.getOpcode() == Hexagon::PATCHABLE_TAIL_CALL) + return true; + if (MI.getOpcode() == Hexagon::A2_nop) return true; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index ea2798a3b44e..21386a91c7b3 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -536,7 +536,7 @@ auto AlignVectors::createAddressGroups() -> bool { erase_if(AddrGroups, [](auto &G) { return G.second.size() == 1; }); // Remove groups that don't use HVX types. erase_if(AddrGroups, [&](auto &G) { - return !llvm::any_of( + return llvm::none_of( G.second, [&](auto &I) { return HVC.HST.isTypeForHVX(I.ValTy); }); }); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index 4125566bc58a..c9a1781a4543 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -154,9 +154,8 @@ namespace HexagonII { PrefersSlot3Pos = 57, PrefersSlot3Mask = 0x1, - // v65 - HasTmpDstPos = 60, - HasTmpDstMask = 0x1, + HasHvxTmpPos = 60, + HasHvxTmpMask = 0x1, CVINewPos = 62, CVINewMask = 0x1, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp index fee1acdbbe8a..96c2965296ca 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -98,6 +98,10 @@ void HexagonMCChecker::init(MCInst const &MCI) { for 
(unsigned i = 0; i < MCID.getNumImplicitUses(); ++i) initReg(MCI, MCID.getImplicitUses()[i], PredReg, isTrue); + const bool IgnoreTmpDst = (HexagonMCInstrInfo::hasTmpDst(MCII, MCI) || + HexagonMCInstrInfo::hasHvxTmp(MCII, MCI)) && + STI.getFeatureBits()[Hexagon::ArchV69]; + // Get implicit register definitions. if (const MCPhysReg *ImpDef = MCID.getImplicitDefs()) for (; *ImpDef; ++ImpDef) { @@ -123,7 +127,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) // Include implicit late predicates. LatePreds.insert(R); - else + else if (!IgnoreTmpDst) Defs[R].insert(PredSense(PredReg, isTrue)); } @@ -178,7 +182,7 @@ void HexagonMCChecker::init(MCInst const &MCI) { // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and // destination registers with this instruction. same for vdeal(Vx,Vy,Rx) Uses.insert(*SRI); - else + else if (!IgnoreTmpDst) Defs[*SRI].insert(PredSense(PredReg, isTrue)); } } @@ -227,9 +231,11 @@ bool HexagonMCChecker::check(bool FullCheck) { bool chkAXOK = checkAXOK(); bool chkCofMax1 = checkCOFMax1(); bool chkHWLoop = checkHWLoop(); + bool chkValidTmpDst = FullCheck ? 
checkValidTmpDst() : true; bool chkLegalVecRegPair = checkLegalVecRegPair(); bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl && - chkAXOK && chkCofMax1 && chkHWLoop && chkLegalVecRegPair; + chkAXOK && chkCofMax1 && chkHWLoop && chkValidTmpDst && + chkLegalVecRegPair; return chk; } @@ -676,6 +682,32 @@ bool HexagonMCChecker::checkShuffle() { return MCSDX.check(); } +bool HexagonMCChecker::checkValidTmpDst() { + if (!STI.getFeatureBits()[Hexagon::ArchV69]) { + return true; + } + auto HasTmp = [&](MCInst const &I) { + return HexagonMCInstrInfo::hasTmpDst(MCII, I) || + HexagonMCInstrInfo::hasHvxTmp(MCII, I); + }; + unsigned HasTmpCount = + llvm::count_if(HexagonMCInstrInfo::bundleInstructions(MCII, MCB), HasTmp); + + if (HasTmpCount > 1) { + reportError( + MCB.getLoc(), + "this packet has more than one HVX vtmp/.tmp destination instruction"); + + for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) + if (HasTmp(I)) + reportNote(I.getLoc(), + "this is an HVX vtmp/.tmp destination instruction"); + + return false; + } + return true; +} + void HexagonMCChecker::compoundRegisterMap(unsigned &Register) { switch (Register) { default: diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h index 00afdb664ba5..dbd3d8ae45e6 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -99,6 +99,7 @@ class HexagonMCChecker { bool checkHWLoop(); bool checkCOFMax1(); bool checkLegalVecRegPair(); + bool checkValidTmpDst(); static void compoundRegisterMap(unsigned &); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index fa12fe1da448..68ccb20f4f15 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -939,10 +939,24 @@ bool 
HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII, return (F >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask; } -/// return true if instruction has hasTmpDst attribute. bool HexagonMCInstrInfo::hasTmpDst(MCInstrInfo const &MCII, MCInst const &MCI) { + switch (MCI.getOpcode()) { + default: + return false; + case Hexagon::V6_vgathermh: + case Hexagon::V6_vgathermhq: + case Hexagon::V6_vgathermhw: + case Hexagon::V6_vgathermhwq: + case Hexagon::V6_vgathermw: + case Hexagon::V6_vgathermwq: + return true; + } + return false; +} + +bool HexagonMCInstrInfo::hasHvxTmp(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; - return (F >> HexagonII::HasTmpDstPos) & HexagonII::HasTmpDstMask; + return (F >> HexagonII::HasHvxTmpPos) & HexagonII::HasHvxTmpMask; } bool HexagonMCInstrInfo::requiresSlot(MCSubtargetInfo const &STI, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 7b3c079880f8..5c56db14798f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -41,7 +41,8 @@ public: namespace Hexagon { -class PacketIterator { +class PacketIterator : public std::iterator<std::forward_iterator_tag, + PacketIterator> { MCInstrInfo const &MCII; MCInst::const_iterator BundleCurrent; MCInst::const_iterator BundleEnd; @@ -188,6 +189,7 @@ bool hasImmExt(MCInst const &MCI); bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI); bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI); bool hasTmpDst(MCInstrInfo const &MCII, MCInst const &MCI); +bool hasHvxTmp(MCInstrInfo const &MCII, MCInst const &MCI); unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb); int64_t minConstant(MCInst const &MCI, size_t Index); diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp 
b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index d832a756cb92..dfdddb50657c 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -80,6 +80,8 @@ cl::opt<bool> MV67T("mv67t", cl::Hidden, cl::desc("Build for Hexagon V67T"), cl::init(false)); cl::opt<bool> MV68("mv68", cl::Hidden, cl::desc("Build for Hexagon V68"), cl::init(false)); +cl::opt<bool> MV69("mv69", cl::Hidden, cl::desc("Build for Hexagon V69"), + cl::init(false)); cl::opt<Hexagon::ArchEnum> EnableHVX("mhvx", @@ -91,6 +93,7 @@ cl::opt<Hexagon::ArchEnum> clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"), clEnumValN(Hexagon::ArchEnum::V67, "v67", "Build for HVX v67"), clEnumValN(Hexagon::ArchEnum::V68, "v68", "Build for HVX v68"), + clEnumValN(Hexagon::ArchEnum::V69, "v69", "Build for HVX v69"), // Sentinel for no value specified. clEnumValN(Hexagon::ArchEnum::Generic, "", "")), // Sentinel for flag not present. 
@@ -101,6 +104,11 @@ static cl::opt<bool> DisableHVX("mno-hvx", cl::Hidden, cl::desc("Disable Hexagon Vector eXtensions")); +static cl::opt<bool> + EnableHvxIeeeFp("mhvx-ieee-fp", cl::Hidden, + cl::desc("Enable HVX IEEE floating point extensions")); +static cl::opt<bool> EnableHexagonCabac + ("mcabac", cl::desc("tbd"), cl::init(false)); static StringRef DefaultArch = "hexagonv60"; @@ -123,6 +131,8 @@ static StringRef HexagonGetArchVariant() { return "hexagonv67t"; if (MV68) return "hexagonv68"; + if (MV69) + return "hexagonv69"; return ""; } @@ -371,6 +381,9 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { case Hexagon::ArchEnum::V68: Result.push_back("+hvxv68"); break; + case Hexagon::ArchEnum::V69: + Result.push_back("+hvxv69"); + break; case Hexagon::ArchEnum::Generic:{ Result.push_back(StringSwitch<StringRef>(CPU) .Case("hexagonv60", "+hvxv60") @@ -379,13 +392,19 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) { .Case("hexagonv66", "+hvxv66") .Case("hexagonv67", "+hvxv67") .Case("hexagonv67t", "+hvxv67") - .Case("hexagonv68", "+hvxv68")); + .Case("hexagonv68", "+hvxv68") + .Case("hexagonv69", "+hvxv69")); break; } case Hexagon::ArchEnum::NoArch: // Sentinel if -mhvx isn't specified break; } + if (EnableHvxIeeeFp) + Result.push_back("+hvx-ieee-fp"); + if (EnableHexagonCabac) + Result.push_back("+cabac"); + return join(Result.begin(), Result.end(), ","); } } @@ -422,8 +441,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // turns on hvxvNN, corresponding to the existing ArchVNN. 
FeatureBitset FB = S; unsigned CpuArch = ArchV5; - for (unsigned F : {ArchV68, ArchV67, ArchV66, ArchV65, ArchV62, ArchV60, - ArchV55, ArchV5}) { + for (unsigned F : {ArchV69, ArchV68, ArchV67, ArchV66, ArchV65, ArchV62, + ArchV60, ArchV55, ArchV5}) { if (!FB.test(F)) continue; CpuArch = F; @@ -438,7 +457,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { } bool HasHvxVer = false; for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, - ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68}) { + ExtensionHVXV66, ExtensionHVXV67, ExtensionHVXV68, + ExtensionHVXV69}) { if (!FB.test(F)) continue; HasHvxVer = true; @@ -451,6 +471,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) { // HasHvxVer is false, and UseHvx is true. switch (CpuArch) { + case ArchV69: + FB.set(ExtensionHVXV69); + LLVM_FALLTHROUGH; case ArchV68: FB.set(ExtensionHVXV68); LLVM_FALLTHROUGH; @@ -538,6 +561,7 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) { {"hexagonv67", ELF::EF_HEXAGON_MACH_V67}, {"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T}, {"hexagonv68", ELF::EF_HEXAGON_MACH_V68}, + {"hexagonv69", ELF::EF_HEXAGON_MACH_V69}, }; auto F = ElfFlags.find(STI.getCPU()); diff --git a/llvm/lib/Target/M68k/M68kInstrControl.td b/llvm/lib/Target/M68k/M68kInstrControl.td index 708474726861..9f87833ab0e2 100644 --- a/llvm/lib/Target/M68k/M68kInstrControl.td +++ b/llvm/lib/Target/M68k/M68kInstrControl.td @@ -118,13 +118,13 @@ def SET#"p8"#cc : MxSccM<cc, MxType8.POp, MxType8.PPat, MxEncEAp_0, MxExtI16_0>; /// 0 1 0 0 1 1 1 0 1 1 | MODE | REG ///------------------------------+---------+--------- let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in -class MxJMP<MxOperand LOCOp, ComplexPattern LOCPat, MxEncEA EA, MxEncExt EXT> +class MxJMP<MxOperand LOCOp, MxEncEA EA, MxEncExt EXT> : MxInst<(outs), (ins LOCOp:$dst), "jmp\t$dst", [(brind iPTR:$dst)], MxEncoding<EA.Reg, EA.DA, EA.Mode, MxBead2Bits<0b11>, 
MxBead4Bits<0b1110>, MxBead4Bits<0b0100>, EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>>; -def JMP32j : MxJMP<MxARI32, MxCP_ARI, MxEncEAj_0, MxExtEmpty>; +def JMP32j : MxJMP<MxARI32, MxEncEAj_0, MxExtEmpty>; // FIXME Support 16 bit indirect jump. @@ -147,17 +147,17 @@ def JMP32j : MxJMP<MxARI32, MxCP_ARI, MxEncEAj_0, MxExtEmpty>; /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// -------------------------------------------------- let isBranch = 1, isTerminator = 1, Uses = [CCR] in -class MxBcc<string cc, Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty> +class MxBcc<string cc, Operand TARGET, MxEncoding ENC = MxEncEmpty> : MxInst<(outs), (ins TARGET:$dst), "b"#cc#"\t$dst", [], ENC>; foreach cc = [ "cc", "ls", "lt", "eq", "mi", "ne", "ge", "cs", "pl", "gt", "hi", "vc", "le", "vs"] in { def B#cc#"8" - : MxBcc<cc, MxBrTarget8, MxType8, + : MxBcc<cc, MxBrTarget8, MxEncoding<MxBead8Disp<0>, !cast<MxBead4Bits>("MxCC"#cc), MxBead4Bits<0x6>>>; def B#cc#"16" - : MxBcc<cc, MxBrTarget16, MxType16, + : MxBcc<cc, MxBrTarget16, MxEncoding<MxBead4Bits<0x0>, MxBead4Bits<0x0>, !cast<MxBead4Bits>("MxCC"#cc), MxBead4Bits<0x6>, MxBead16Imm<0>>>; @@ -179,13 +179,13 @@ def : Pat<(MxBrCond bb:$target, !cast<PatLeaf>("MxCOND"#cc), CCR), /// 32-BIT DISPLACEMENT IF 8-BIT DISPLACEMENT = $FF /// ------------------------------------------------- let isBranch = 1, isTerminator = 1, isBarrier=1 in -class MxBra<Operand TARGET, MxType TYPE, MxEncoding ENC = MxEncEmpty> +class MxBra<Operand TARGET, MxEncoding ENC = MxEncEmpty> : MxInst<(outs), (ins TARGET:$dst), "bra\t$dst", [], ENC>; -def BRA8 : MxBra<MxBrTarget8, MxType8, +def BRA8 : MxBra<MxBrTarget8, MxEncoding<MxBead8Disp<0>, MxBead4Bits<0x0>, MxBead4Bits<0x6>>>; -def BRA16 : MxBra<MxBrTarget16, MxType16, +def BRA16 : MxBra<MxBrTarget16, MxEncoding<MxBead4Bits<0x0>, MxBead4Bits<0x0>, MxBead4Bits<0x0>, MxBead4Bits<0x6>, MxBead16Imm<0>>>; diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp 
b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index 2a77a150f9aa..4ef9a567d453 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -189,8 +189,8 @@ bool MSP430FrameLowering::spillCalleeSavedRegisters( MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>(); MFI->setCalleeSavedFrameSize(CSI.size() * 2); - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r)) diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 203e05dde7ad..419f0ac1a8a7 100644 --- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -479,14 +479,12 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV, // remove the use-soft-float attribute static void removeUseSoftFloat(Function &F) { - AttrBuilder B; LLVM_DEBUG(errs() << "removing -use-soft-float\n"); - B.addAttribute("use-soft-float", "false"); - F.removeFnAttrs(B); + F.removeFnAttr("use-soft-float"); if (F.hasFnAttribute("use-soft-float")) { LLVM_DEBUG(errs() << "still has -use-soft-float\n"); } - F.addFnAttrs(B); + F.addFnAttr("use-soft-float", "false"); } // This pass only makes sense when the underlying chip has floating point but diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index aa8e298fa759..4e9a23d077da 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -36,7 +36,7 @@ /// /// Regarding compact branch hazard prevention: /// -/// Hazards handled: forbidden slots for MIPSR6. +/// Hazards handled: forbidden slots for MIPSR6, FPU slots for MIPS3 and below. 
/// /// A forbidden slot hazard occurs when a compact branch instruction is executed /// and the adjacent instruction in memory is a control transfer instruction @@ -160,7 +160,10 @@ private: bool buildProperJumpMI(MachineBasicBlock *MBB, MachineBasicBlock::iterator Pos, DebugLoc DL); void expandToLongBranch(MBBInfo &Info); + template <typename Pred, typename Safe> + bool handleSlot(Pred Predicate, Safe SafeInSlot); bool handleForbiddenSlot(); + bool handleFPUDelaySlot(); bool handlePossibleLongBranch(); const MipsSubtarget *STI; @@ -738,30 +741,27 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { MBB.removeLiveIn(Mips::V0); } -bool MipsBranchExpansion::handleForbiddenSlot() { - // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6. - if (!STI->hasMips32r6() || STI->inMicroMipsMode()) - return false; - +template <typename Pred, typename Safe> +bool MipsBranchExpansion::handleSlot(Pred Predicate, Safe SafeInSlot) { bool Changed = false; for (MachineFunction::iterator FI = MFp->begin(); FI != MFp->end(); ++FI) { for (Iter I = FI->begin(); I != FI->end(); ++I) { - // Forbidden slot hazard handling. Use lookahead over state. - if (!TII->HasForbiddenSlot(*I)) + // Delay slot hazard handling. Use lookahead over state. 
+ if (!Predicate(*I)) continue; - Iter Inst; + Iter IInSlot; bool LastInstInFunction = std::next(I) == FI->end() && std::next(FI) == MFp->end(); if (!LastInstInFunction) { std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI); LastInstInFunction |= Res.second; - Inst = Res.first; + IInSlot = Res.first; } - if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) { + if (LastInstInFunction || !SafeInSlot(*IInSlot, *I)) { MachineBasicBlock::instr_iterator Iit = I->getIterator(); if (std::next(Iit) == FI->end() || @@ -778,6 +778,29 @@ bool MipsBranchExpansion::handleForbiddenSlot() { return Changed; } +bool MipsBranchExpansion::handleForbiddenSlot() { + // Forbidden slot hazards are only defined for MIPSR6 but not microMIPSR6. + if (!STI->hasMips32r6() || STI->inMicroMipsMode()) + return false; + + return handleSlot( + [this](auto &I) -> bool { return TII->HasForbiddenSlot(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeInForbiddenSlot(IInSlot); + }); +} + +bool MipsBranchExpansion::handleFPUDelaySlot() { + // FPU delay slots are only defined for MIPS3 and below. 
+ if (STI->hasMips32() || STI->hasMips4()) + return false; + + return handleSlot([this](auto &I) -> bool { return TII->HasFPUDelaySlot(I); }, + [this](auto &IInSlot, auto &I) -> bool { + return TII->SafeInFPUDelaySlot(IInSlot, I); + }); +} + bool MipsBranchExpansion::handlePossibleLongBranch() { if (STI->inMips16Mode() || !STI->enableLongBranchPass()) return false; @@ -857,13 +880,16 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { // Run these two at least once bool longBranchChanged = handlePossibleLongBranch(); bool forbiddenSlotChanged = handleForbiddenSlot(); + bool fpuDelaySlotChanged = handleFPUDelaySlot(); - bool Changed = longBranchChanged || forbiddenSlotChanged; + bool Changed = + longBranchChanged || forbiddenSlotChanged || fpuDelaySlotChanged; // Then run them alternatively while there are changes while (forbiddenSlotChanged) { longBranchChanged = handlePossibleLongBranch(); - if (!longBranchChanged) + fpuDelaySlotChanged = handleFPUDelaySlot(); + if (!longBranchChanged && !fpuDelaySlotChanged) break; forbiddenSlotChanged = handleForbiddenSlot(); } diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 4f364ef6afc7..9377e83524e1 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4121,7 +4121,7 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 'd': // Address register. Same as 'r' unless generating MIPS16 code. case 'y': // Same as 'r'. Exists for compatibility. 
case 'r': - if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8 || VT == MVT::i1) { if (Subtarget.inMips16Mode()) return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp index 94828a976695..2bf8562895d7 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -568,11 +568,60 @@ bool MipsInstrInfo::SafeInForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::IsCTI) == 0; } +bool MipsInstrInfo::SafeInFPUDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &FPUMI) const { + if (MIInSlot.isInlineAsm()) + return false; + + if (HasFPUDelaySlot(MIInSlot)) + return false; + + switch (MIInSlot.getOpcode()) { + case Mips::BC1F: + case Mips::BC1FL: + case Mips::BC1T: + case Mips::BC1TL: + return false; + } + + for (const MachineOperand &Op : FPUMI.defs()) { + if (!Op.isReg()) + continue; + + bool Reads, Writes; + std::tie(Reads, Writes) = MIInSlot.readsWritesVirtualRegister(Op.getReg()); + + if (Reads || Writes) + return false; + } + + return true; +} + /// Predicate for distingushing instructions that have forbidden slots. bool MipsInstrInfo::HasForbiddenSlot(const MachineInstr &MI) const { return (MI.getDesc().TSFlags & MipsII::HasForbiddenSlot) != 0; } +/// Predicate for distingushing instructions that have FPU delay slots. +bool MipsInstrInfo::HasFPUDelaySlot(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case Mips::MTC1: + case Mips::MFC1: + case Mips::MTC1_D64: + case Mips::MFC1_D64: + case Mips::DMTC1: + case Mips::DMFC1: + case Mips::FCMP_S32: + case Mips::FCMP_D32: + case Mips::FCMP_D64: + return true; + + default: + return false; + } +} + /// Return the number of bytes of code the specified instruction may be. 
unsigned MipsInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { switch (MI.getOpcode()) { diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index c96ed202df30..46c1b73d512f 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -92,9 +92,16 @@ public: /// Predicate to determine if an instruction can go in a forbidden slot. bool SafeInForbiddenSlot(const MachineInstr &MI) const; + /// Predicate to determine if an instruction can go in an FPU delay slot. + bool SafeInFPUDelaySlot(const MachineInstr &MIInSlot, + const MachineInstr &FPUMI) const; + /// Predicate to determine if an instruction has a forbidden slot. bool HasForbiddenSlot(const MachineInstr &MI) const; + /// Predicate to determine if an instruction has an FPU delay slot. + bool HasFPUDelaySlot(const MachineInstr &MI) const; + /// Insert nop instruction when hazard condition is found void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index c35e67d6726f..16add48d4602 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1098,10 +1098,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, O << " .attribute(.managed)"; } - if (GVar->getAlignment() == 0) - O << " .align " << (int)DL.getPrefTypeAlignment(ETy); + if (MaybeAlign A = GVar->getAlign()) + O << " .align " << A->value(); else - O << " .align " << GVar->getAlignment(); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); if (ETy->isFloatingPointTy() || ETy->isPointerTy() || (ETy->isIntegerTy() && ETy->getScalarSizeInBits() <= 64)) { @@ -1290,10 +1290,10 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, O << "."; emitPTXAddressSpace(GVar->getType()->getAddressSpace(), O); - if (GVar->getAlignment() == 0) - O << " .align " << 
(int)DL.getPrefTypeAlignment(ETy); + if (MaybeAlign A = GVar->getAlign()) + O << " .align " << A->value(); else - O << " .align " << GVar->getAlignment(); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); // Special case for i128 if (ETy->isIntegerTy(128)) { diff --git a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp index 1f3b4c9440d8..bf3c87df2e08 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -126,9 +126,9 @@ static void CombineCVTAToLocal(MachineInstr &Root) { // Check if MRI has only one non dbg use, which is Root if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) { - Prev.eraseFromParentAndMarkDBGValuesForRemoval(); + Prev.eraseFromParent(); } - Root.eraseFromParentAndMarkDBGValuesForRemoval(); + Root.eraseFromParent(); } bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { @@ -157,7 +157,7 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) { const auto &MRI = MF.getRegInfo(); if (MRI.use_empty(NRI->getFrameRegister(MF))) { if (auto MI = MRI.getUniqueVRegDef(NRI->getFrameRegister(MF))) { - MI->eraseFromParentAndMarkDBGValuesForRemoval(); + MI->eraseFromParent(); } } diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index 9e181d4052d6..ded922329ebf 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1576,6 +1576,16 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, std::swap(Operands[2], Operands[1]); } + // Handle base mnemonic for atomic loads where the EH bit is zero. 
+ if (Name == "lqarx" || Name == "ldarx" || Name == "lwarx" || + Name == "lharx" || Name == "lbarx") { + if (Operands.size() != 5) + return false; + PPCOperand &EHOp = (PPCOperand &)*Operands[4]; + if (EHOp.isU1Imm() && EHOp.getImm() == 0) + Operands.pop_back(); + } + return false; } @@ -1745,7 +1755,7 @@ unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, } PPCOperand &Op = static_cast<PPCOperand &>(AsmOp); - if (Op.isImm() && Op.getImm() == ImmVal) + if (Op.isU3Imm() && Op.getImm() == ImmVal) return Match_Success; return Match_InvalidOperand; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 22b948a83c34..d6e02d0d0862 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -28,6 +28,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" @@ -368,6 +369,31 @@ static MCInstPrinter *createPPCMCInstPrinter(const Triple &T, return new PPCInstPrinter(MAI, MII, MRI, T); } +namespace { + +class PPCMCInstrAnalysis : public MCInstrAnalysis { +public: + explicit PPCMCInstrAnalysis(const MCInstrInfo *Info) + : MCInstrAnalysis(Info) {} + + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const override { + unsigned NumOps = Inst.getNumOperands(); + if (NumOps == 0 || + Info->get(Inst.getOpcode()).OpInfo[NumOps - 1].OperandType != + MCOI::OPERAND_PCREL) + return false; + Target = Addr + Inst.getOperand(NumOps - 1).getImm() * Size; + return true; + } +}; + +} // end anonymous namespace + +static MCInstrAnalysis *createPPCMCInstrAnalysis(const MCInstrInfo *Info) { + return new PPCMCInstrAnalysis(Info); +} + extern "C" LLVM_EXTERNAL_VISIBILITY void 
LLVMInitializePowerPCTargetMC() { for (Target *T : {&getThePPC32Target(), &getThePPC32LETarget(), &getThePPC64Target(), &getThePPC64LETarget()}) { @@ -383,6 +409,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() { // Register the MC subtarget info. TargetRegistry::RegisterMCSubtargetInfo(*T, createPPCMCSubtargetInfo); + // Register the MC instruction analyzer. + TargetRegistry::RegisterMCInstrAnalysis(*T, createPPCMCInstrAnalysis); + // Register the MC Code Emitter TargetRegistry::RegisterMCCodeEmitter(*T, createPPCMCCodeEmitter); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index 422bd11dca52..bbd5f5fd1941 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -219,6 +219,10 @@ def FeatureZeroMoveFusion: SubtargetFeature<"fuse-zeromove", "HasZeroMoveFusion", "true", "Target supports move to SPR with branch fusion", [FeatureFusion]>; +def FeatureBack2BackFusion: + SubtargetFeature<"fuse-back2back", "HasBack2BackFusion", "true", + "Target supports general back to back fusion", + [FeatureFusion]>; def FeatureUnalignedFloats : SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess", "true", "CPU does not trap on unaligned FP access">; diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 16e3b2b85c2e..f26c15667a0b 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -347,7 +347,6 @@ bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, // At the moment, all inline asm memory operands are a single register. // In any case, the output of this routine should always be just one // assembler operand. 
- bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) { diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def new file mode 100644 index 000000000000..38ed5f2e78e3 --- /dev/null +++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def @@ -0,0 +1,1042 @@ +// Automatically generated file, do not edit! +// +// This file defines instruction list for general back2back fusion. +//===----------------------------------------------------------------------===// +FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1, + FUSION_OP_SET(ADD4, + ADD4O, + ADD4TLS, + ADD4_rec, + ADD8, + ADD8O, + ADD8TLS, + ADD8TLS_, + ADD8_rec, + ADDE, + ADDE8, + ADDE8O, + ADDEO, + ADDEX, + ADDEX8, + ADDI, + ADDI8, + ADDIC, + ADDIC8, + ADDIS, + ADDIS8, + ADDISdtprelHA32, + ADDIStocHA, + ADDIStocHA8, + ADDIdtprelL32, + ADDItlsldLADDR32, + ADDItocL, + ADDME, + ADDME8, + ADDME8O, + ADDMEO, + ADDZE, + ADDZE8, + ADDZE8O, + ADDZEO, + AND, + AND8, + AND8_rec, + ANDC, + ANDC8, + ANDC8_rec, + ANDC_rec, + ANDI8_rec, + ANDIS8_rec, + ANDIS_rec, + ANDI_rec, + AND_rec, + CMPB, + CMPB8, + CNTLZD, + CNTLZD_rec, + CNTLZW, + CNTLZW8, + CNTLZW8_rec, + CNTLZW_rec, + CNTTZD, + CNTTZD_rec, + CNTTZW, + CNTTZW8, + CNTTZW8_rec, + CNTTZW_rec, + EQV, + EQV8, + EQV8_rec, + EQV_rec, + EXTSB, + EXTSB8, + EXTSB8_32_64, + EXTSB8_rec, + EXTSB_rec, + EXTSH, + EXTSH8, + EXTSH8_32_64, + EXTSH8_rec, + EXTSH_rec, + EXTSW, + EXTSWSLI, + EXTSWSLI_32_64, + EXTSWSLI_32_64_rec, + EXTSWSLI_rec, + EXTSW_32, + EXTSW_32_64, + EXTSW_32_64_rec, + EXTSW_rec, + FABSD, + FABSS, + FCPSGND, + FCPSGNS, + FMR, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + ISEL, + ISEL8, + LI, + LI8, + LIS, + LIS8, + MFCTR, + MFCTR8, + MFLR, + MFLR8, + MFOCRF, + MFOCRF8, + MFVRD, + MFVRWZ, + MFVSRD, + MFVSRWZ, + MTVRD, + MTVRWA, + MTVRWZ, + MTVSRBM, + MTVSRBMI, + MTVSRD, + MTVSRDM, + MTVSRHM, + MTVSRQM, + MTVSRWA, + MTVSRWM, + MTVSRWZ, + NAND, + NAND8, + 
NAND8_rec, + NAND_rec, + NEG, + NEG8, + NEG8O, + NEG8_rec, + NEGO, + NEG_rec, + NOP, + NOP_GT_PWR6, + NOP_GT_PWR7, + NOR, + NOR8, + NOR8_rec, + NOR_rec, + OR, + OR8, + OR8_rec, + ORC, + ORC8, + ORC8_rec, + ORC_rec, + ORI, + ORI8, + ORIS, + ORIS8, + OR_rec, + POPCNTB, + POPCNTB8, + POPCNTD, + POPCNTW, + RLDCL, + RLDCL_rec, + RLDCR, + RLDCR_rec, + RLDIC, + RLDICL, + RLDICL_32, + RLDICL_32_64, + RLDICL_32_rec, + RLDICL_rec, + RLDICR, + RLDICR_32, + RLDICR_rec, + RLDIC_rec, + RLDIMI, + RLDIMI_rec, + RLWIMI, + RLWIMI8, + RLWIMI8_rec, + RLWIMI_rec, + RLWINM, + RLWINM8, + RLWINM8_rec, + RLWINM_rec, + RLWNM, + RLWNM8, + RLWNM8_rec, + RLWNM_rec, + SETB, + SETB8, + SETBC, + SETBC8, + SETBCR, + SETBCR8, + SETNBC, + SETNBC8, + SETNBCR, + SETNBCR8, + SLD, + SLD_rec, + SLW, + SLW8, + SLW8_rec, + SLW_rec, + SRAD, + SRADI, + SRADI_32, + SRAW, + SRAWI, + SRD, + SRD_rec, + SRW, + SRW8, + SRW8_rec, + SRW_rec, + SUBF, + SUBF8, + SUBF8O, + SUBF8_rec, + SUBFE, + SUBFE8, + SUBFE8O, + SUBFEO, + SUBFIC, + SUBFIC8, + SUBFME, + SUBFME8, + SUBFME8O, + SUBFMEO, + SUBFO, + SUBFZE, + SUBFZE8, + SUBFZE8O, + SUBFZEO, + SUBF_rec, + VABSDUB, + VABSDUH, + VABSDUW, + VADDCUW, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUDM, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCMPBFP, + VCMPBFP_rec, + VCMPEQFP, + VCMPEQFP_rec, + VCMPEQUB, + VCMPEQUB_rec, + VCMPEQUD, + VCMPEQUD_rec, + VCMPEQUH, + VCMPEQUH_rec, + VCMPEQUQ, + VCMPEQUQ_rec, + VCMPEQUW, + VCMPEQUW_rec, + VCMPGEFP, + VCMPGEFP_rec, + VCMPGTFP, + VCMPGTFP_rec, + VCMPGTSB, + VCMPGTSB_rec, + VCMPGTSD, + VCMPGTSD_rec, + VCMPGTSH, + VCMPGTSH_rec, + VCMPGTSQ, + VCMPGTSQ_rec, + VCMPGTSW, + VCMPGTSW_rec, + VCMPGTUB, + VCMPGTUB_rec, + VCMPGTUD, + VCMPGTUD_rec, + VCMPGTUH, + VCMPGTUH_rec, + VCMPGTUQ, + VCMPGTUQ_rec, + VCMPGTUW, + VCMPGTUW_rec, + VCMPNEB, + VCMPNEB_rec, + VCMPNEH, + VCMPNEH_rec, + VCMPNEW, + VCMPNEW_rec, + 
VCMPNEZB, + VCMPNEZB_rec, + VCMPNEZH, + VCMPNEZH_rec, + VCMPNEZW, + VCMPNEZW_rec, + VCNTMBB, + VCNTMBD, + VCNTMBH, + VCNTMBW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VEQV, + VEXPANDBM, + VEXPANDDM, + VEXPANDHM, + VEXPANDQM, + VEXPANDWM, + VEXTRACTBM, + VEXTRACTDM, + VEXTRACTHM, + VEXTRACTQM, + VEXTRACTWM, + VEXTSB2D, + VEXTSB2Ds, + VEXTSB2W, + VEXTSB2Ws, + VEXTSD2Q, + VEXTSH2D, + VEXTSH2Ds, + VEXTSH2W, + VEXTSH2Ws, + VEXTSW2D, + VEXTSW2Ds, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VMINFP, + VMINSB, + VMINSD, + VMINSH, + VMINSW, + VMINUB, + VMINUD, + VMINUH, + VMINUW, + VMRGEW, + VMRGOW, + VNAND, + VNEGD, + VNEGW, + VNOR, + VOR, + VORC, + VPOPCNTB, + VPOPCNTD, + VPOPCNTH, + VPOPCNTW, + VPRTYBD, + VPRTYBW, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSEL, + VSHASIGMAD, + VSHASIGMAW, + VSLB, + VSLD, + VSLH, + VSLW, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSUBCUW, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUDM, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VXOR, + V_SET0, + V_SET0B, + V_SET0H, + XOR, + XOR8, + XOR8_rec, + XORI, + XORI8, + XORIS, + XORIS8, + XOR_rec, + XSABSDP, + XSABSQP, + XSCMPEQDP, + XSCMPGEDP, + XSCMPGTDP, + XSCPSGNDP, + XSCPSGNQP, + XSCVHPDP, + XSCVSPDPN, + XSIEXPDP, + XSIEXPQP, + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSNABSDP, + XSNABSQP, + XSNEGDP, + XSNEGQP, + XSXEXPDP, + XSXEXPQP, + XSXSIGDP, + XVABSDP, + XVABSSP, + XVCMPEQDP, + XVCMPEQDP_rec, + XVCMPEQSP, + XVCMPEQSP_rec, + XVCMPGEDP, + XVCMPGEDP_rec, + XVCMPGESP, + XVCMPGESP_rec, + XVCMPGTDP, + XVCMPGTDP_rec, + XVCMPGTSP, + XVCMPGTSP_rec, + XVCPSGNDP, + XVCPSGNSP, + XVCVHPSP, + XVIEXPDP, + XVIEXPSP, + XVMAXDP, + XVMAXSP, + XVMINDP, + XVMINSP, + XVNABSDP, + XVNABSSP, + XVNEGDP, + XVNEGSP, + XVTSTDCDP, + XVTSTDCSP, + XVXEXPDP, + XVXEXPSP, + XVXSIGDP, + XVXSIGSP, + XXLAND, + XXLANDC, + XXLEQV, + XXLEQVOnes, + XXLNAND, + XXLNOR, + XXLOR, + 
XXLORC, + XXLORf, + XXLXOR, + XXLXORdpz, + XXLXORspz, + XXLXORz, + XXSEL), + FUSION_OP_SET(ADD4, + ADD4O, + ADD4TLS, + ADD4_rec, + ADD8, + ADD8O, + ADD8TLS, + ADD8TLS_, + ADD8_rec, + ADDE, + ADDE8, + ADDE8O, + ADDEO, + ADDEX, + ADDEX8, + ADDI, + ADDI8, + ADDIC, + ADDIC8, + ADDIS, + ADDIS8, + ADDISdtprelHA32, + ADDIStocHA, + ADDIStocHA8, + ADDIdtprelL32, + ADDItlsldLADDR32, + ADDItocL, + ADDME, + ADDME8, + ADDME8O, + ADDMEO, + ADDZE, + ADDZE8, + ADDZE8O, + ADDZEO, + AND, + AND8, + AND8_rec, + ANDC, + ANDC8, + ANDC8_rec, + ANDC_rec, + ANDI8_rec, + ANDIS8_rec, + ANDIS_rec, + ANDI_rec, + AND_rec, + CMPB, + CMPB8, + CMPD, + CMPDI, + CMPEQB, + CMPLD, + CMPLDI, + CMPLW, + CMPLWI, + CMPRB, + CMPRB8, + CMPW, + CMPWI, + CNTLZD, + CNTLZD_rec, + CNTLZW, + CNTLZW8, + CNTLZW8_rec, + CNTLZW_rec, + CNTTZD, + CNTTZD_rec, + CNTTZW, + CNTTZW8, + CNTTZW8_rec, + CNTTZW_rec, + CR6SET, + CR6UNSET, + CRAND, + CRANDC, + CREQV, + CRNAND, + CRNOR, + CROR, + CRORC, + CRSET, + CRUNSET, + CRXOR, + DSS, + DSSALL, + DST, + DST64, + DSTST, + DSTST64, + DSTSTT, + DSTSTT64, + DSTT, + DSTT64, + EQV, + EQV8, + EQV8_rec, + EQV_rec, + EXTSB, + EXTSB8, + EXTSB8_32_64, + EXTSB8_rec, + EXTSB_rec, + EXTSH, + EXTSH8, + EXTSH8_32_64, + EXTSH8_rec, + EXTSH_rec, + EXTSW, + EXTSWSLI, + EXTSWSLI_32_64, + EXTSWSLI_32_64_rec, + EXTSWSLI_rec, + EXTSW_32, + EXTSW_32_64, + EXTSW_32_64_rec, + EXTSW_rec, + FABSD, + FABSS, + FCMPOD, + FCMPOS, + FCMPUD, + FCMPUS, + FCPSGND, + FCPSGNS, + FMR, + FNABSD, + FNABSS, + FNEGD, + FNEGS, + FTDIV, + FTSQRT, + ISEL, + ISEL8, + LI, + LI8, + LIS, + LIS8, + MCRF, + MCRXRX, + MFCTR, + MFCTR8, + MFLR, + MFLR8, + MFOCRF, + MFOCRF8, + MFVRD, + MFVRWZ, + MFVSRD, + MFVSRWZ, + MTCTR, + MTCTR8, + MTCTR8loop, + MTCTRloop, + MTLR, + MTLR8, + MTOCRF, + MTOCRF8, + MTVRD, + MTVRWA, + MTVRWZ, + MTVSRBM, + MTVSRBMI, + MTVSRD, + MTVSRDM, + MTVSRHM, + MTVSRQM, + MTVSRWA, + MTVSRWM, + MTVSRWZ, + NAND, + NAND8, + NAND8_rec, + NAND_rec, + NEG, + NEG8, + NEG8O, + NEG8_rec, + NEGO, + NEG_rec, + NOP, + 
NOP_GT_PWR6, + NOP_GT_PWR7, + NOR, + NOR8, + NOR8_rec, + NOR_rec, + OR, + OR8, + OR8_rec, + ORC, + ORC8, + ORC8_rec, + ORC_rec, + ORI, + ORI8, + ORIS, + ORIS8, + OR_rec, + POPCNTB, + POPCNTB8, + POPCNTD, + POPCNTW, + RLDCL, + RLDCL_rec, + RLDCR, + RLDCR_rec, + RLDIC, + RLDICL, + RLDICL_32, + RLDICL_32_64, + RLDICL_32_rec, + RLDICL_rec, + RLDICR, + RLDICR_32, + RLDICR_rec, + RLDIC_rec, + RLDIMI, + RLDIMI_rec, + RLWIMI, + RLWIMI8, + RLWIMI8_rec, + RLWIMI_rec, + RLWINM, + RLWINM8, + RLWINM8_rec, + RLWINM_rec, + RLWNM, + RLWNM8, + RLWNM8_rec, + RLWNM_rec, + SETB, + SETB8, + SETBC, + SETBC8, + SETBCR, + SETBCR8, + SETNBC, + SETNBC8, + SETNBCR, + SETNBCR8, + SLD, + SLD_rec, + SLW, + SLW8, + SLW8_rec, + SLW_rec, + SRAD, + SRADI, + SRADI_32, + SRAW, + SRAWI, + SRD, + SRD_rec, + SRW, + SRW8, + SRW8_rec, + SRW_rec, + SUBF, + SUBF8, + SUBF8O, + SUBF8_rec, + SUBFE, + SUBFE8, + SUBFE8O, + SUBFEO, + SUBFIC, + SUBFIC8, + SUBFME, + SUBFME8, + SUBFME8O, + SUBFMEO, + SUBFO, + SUBFZE, + SUBFZE8, + SUBFZE8O, + SUBFZEO, + SUBF_rec, + TD, + TDI, + TRAP, + TW, + TWI, + VABSDUB, + VABSDUH, + VABSDUW, + VADDCUW, + VADDSBS, + VADDSHS, + VADDSWS, + VADDUBM, + VADDUBS, + VADDUDM, + VADDUHM, + VADDUHS, + VADDUWM, + VADDUWS, + VAND, + VANDC, + VAVGSB, + VAVGSH, + VAVGSW, + VAVGUB, + VAVGUH, + VAVGUW, + VCLZB, + VCLZD, + VCLZH, + VCLZW, + VCMPBFP, + VCMPBFP_rec, + VCMPEQFP, + VCMPEQFP_rec, + VCMPEQUB, + VCMPEQUB_rec, + VCMPEQUD, + VCMPEQUD_rec, + VCMPEQUH, + VCMPEQUH_rec, + VCMPEQUQ, + VCMPEQUQ_rec, + VCMPEQUW, + VCMPEQUW_rec, + VCMPGEFP, + VCMPGEFP_rec, + VCMPGTFP, + VCMPGTFP_rec, + VCMPGTSB, + VCMPGTSB_rec, + VCMPGTSD, + VCMPGTSD_rec, + VCMPGTSH, + VCMPGTSH_rec, + VCMPGTSQ, + VCMPGTSQ_rec, + VCMPGTSW, + VCMPGTSW_rec, + VCMPGTUB, + VCMPGTUB_rec, + VCMPGTUD, + VCMPGTUD_rec, + VCMPGTUH, + VCMPGTUH_rec, + VCMPGTUQ, + VCMPGTUQ_rec, + VCMPGTUW, + VCMPGTUW_rec, + VCMPNEB, + VCMPNEB_rec, + VCMPNEH, + VCMPNEH_rec, + VCMPNEW, + VCMPNEW_rec, + VCMPNEZB, + VCMPNEZB_rec, + VCMPNEZH, + VCMPNEZH_rec, + 
VCMPNEZW, + VCMPNEZW_rec, + VCMPSQ, + VCMPUQ, + VCNTMBB, + VCNTMBD, + VCNTMBH, + VCNTMBW, + VCTZB, + VCTZD, + VCTZH, + VCTZW, + VEQV, + VEXPANDBM, + VEXPANDDM, + VEXPANDHM, + VEXPANDQM, + VEXPANDWM, + VEXTRACTBM, + VEXTRACTDM, + VEXTRACTHM, + VEXTRACTQM, + VEXTRACTWM, + VEXTSB2D, + VEXTSB2Ds, + VEXTSB2W, + VEXTSB2Ws, + VEXTSD2Q, + VEXTSH2D, + VEXTSH2Ds, + VEXTSH2W, + VEXTSH2Ws, + VEXTSW2D, + VEXTSW2Ds, + VMAXFP, + VMAXSB, + VMAXSD, + VMAXSH, + VMAXSW, + VMAXUB, + VMAXUD, + VMAXUH, + VMAXUW, + VMINFP, + VMINSB, + VMINSD, + VMINSH, + VMINSW, + VMINUB, + VMINUD, + VMINUH, + VMINUW, + VMRGEW, + VMRGOW, + VNAND, + VNEGD, + VNEGW, + VNOR, + VOR, + VORC, + VPOPCNTB, + VPOPCNTD, + VPOPCNTH, + VPOPCNTW, + VPRTYBD, + VPRTYBW, + VRLB, + VRLD, + VRLDMI, + VRLDNM, + VRLH, + VRLW, + VRLWMI, + VRLWNM, + VSEL, + VSHASIGMAD, + VSHASIGMAW, + VSLB, + VSLD, + VSLH, + VSLW, + VSRAB, + VSRAD, + VSRAH, + VSRAW, + VSRB, + VSRD, + VSRH, + VSRW, + VSUBCUW, + VSUBSBS, + VSUBSHS, + VSUBSWS, + VSUBUBM, + VSUBUBS, + VSUBUDM, + VSUBUHM, + VSUBUHS, + VSUBUWM, + VSUBUWS, + VXOR, + V_SET0, + V_SET0B, + V_SET0H, + WAIT, + XOR, + XOR8, + XOR8_rec, + XORI, + XORI8, + XORIS, + XORIS8, + XOR_rec, + XSABSDP, + XSABSQP, + XSCMPEQDP, + XSCMPEXPDP, + XSCMPGEDP, + XSCMPGTDP, + XSCMPODP, + XSCMPUDP, + XSCPSGNDP, + XSCPSGNQP, + XSCVHPDP, + XSCVSPDPN, + XSIEXPDP, + XSIEXPQP, + XSMAXCDP, + XSMAXDP, + XSMAXJDP, + XSMINCDP, + XSMINDP, + XSMINJDP, + XSNABSDP, + XSNABSQP, + XSNEGDP, + XSNEGQP, + XSTDIVDP, + XSTSQRTDP, + XSTSTDCDP, + XSTSTDCSP, + XSXEXPDP, + XSXEXPQP, + XSXSIGDP, + XVABSDP, + XVABSSP, + XVCMPEQDP, + XVCMPEQDP_rec, + XVCMPEQSP, + XVCMPEQSP_rec, + XVCMPGEDP, + XVCMPGEDP_rec, + XVCMPGESP, + XVCMPGESP_rec, + XVCMPGTDP, + XVCMPGTDP_rec, + XVCMPGTSP, + XVCMPGTSP_rec, + XVCPSGNDP, + XVCPSGNSP, + XVCVHPSP, + XVIEXPDP, + XVIEXPSP, + XVMAXDP, + XVMAXSP, + XVMINDP, + XVMINSP, + XVNABSDP, + XVNABSSP, + XVNEGDP, + XVNEGSP, + XVTDIVDP, + XVTDIVSP, + XVTLSBB, + XVTSQRTDP, + XVTSQRTSP, + XVTSTDCDP, + XVTSTDCSP, + 
XVXEXPDP, + XVXEXPSP, + XVXSIGDP, + XVXSIGSP, + XXLAND, + XXLANDC, + XXLEQV, + XXLEQVOnes, + XXLNAND, + XXLNOR, + XXLOR, + XXLORC, + XXLORf, + XXLXOR, + XXLXORdpz, + XXLXORspz, + XXLXORz, + XXSEL))
\ No newline at end of file diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index a2664bcff4ab..ba74af5ef5f7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -4464,9 +4464,10 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) { bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + MemIntrinsicSDNode *MIN = dyn_cast<MemIntrinsicSDNode>(N); SDValue AddrOp; - if (LDN) - AddrOp = LDN->getOperand(1); + if (LDN || (MIN && MIN->getOpcode() == PPCISD::LD_SPLAT)) + AddrOp = N->getOperand(1); else if (STN) AddrOp = STN->getOperand(2); @@ -5973,6 +5974,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (Type != MVT::v16i8 && Type != MVT::v8i16) break; + // If the alignment for the load is 16 or bigger, we don't need the + // permutated mask to get the required value. The value must be the 0 + // element in big endian target or 7/15 in little endian target in the + // result vsx register of lvx instruction. + // Select the instruction in the .td file. + if (cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) && + isOffsetMultipleOf(N, 16)) + break; + SDValue ZeroReg = CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO, Subtarget->isPPC64() ? MVT::i64 : MVT::i32); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ec7e30d7e362..8d6edf07bc53 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -3500,15 +3500,16 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (LHS.getValueType() == MVT::v2i64) { // Equality can be handled by casting to the legal type for Altivec // comparisons, everything else needs to be expanded. 
- if (CC == ISD::SETEQ || CC == ISD::SETNE) { - return DAG.getNode( - ISD::BITCAST, dl, MVT::v2i64, - DAG.getSetCC(dl, MVT::v4i32, - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS), - DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC)); - } - - return SDValue(); + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return SDValue(); + SDValue SetCC32 = DAG.getSetCC( + dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS), + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC); + int ShuffV[] = {1, 0, 3, 2}; + SDValue Shuff = + DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV); + return DAG.getBitcast( + MVT::v2i64, DAG.getNode(ISD::AND, dl, MVT::v4i32, Shuff, SetCC32)); } // We handle most of these in the usual way. @@ -6206,20 +6207,13 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( ArgOffset += PtrByteSize; continue; } - // Copy entire object into memory. There are cases where gcc-generated - // code assumes it is there, even if it could be put entirely into - // registers. (This is not what the doc says.) - - // FIXME: The above statement is likely due to a misunderstanding of the - // documents. All arguments must be copied into the parameter area BY - // THE CALLEE in the event that the callee takes the address of any - // formal argument. That has not yet been implemented. However, it is - // reasonable to use the stack area as a staging area for the register - // load. - - // Skip this for small aggregates, as we will use the same slot for a - // right-justified copy, below. - if (Size >= 8) + // Copy the object to parameter save area if it can not be entirely passed + // by registers. + // FIXME: we only need to copy the parts which need to be passed in + // parameter save area. For the parts passed by registers, we don't need + // to copy them to the stack although we need to allocate space for them + // in parameter save area. 
+ if ((NumGPRs - GPR_idx) * PtrByteSize < Size) Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff, CallSeqStart, Flags, DAG, dl); @@ -17548,14 +17542,14 @@ unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N, if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) || (ParentOp == ISD::INTRINSIC_VOID))) { unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue(); - assert( - ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) && - "Only the paired load and store (lxvp/stxvp) intrinsics are valid."); - SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) ? Parent->getOperand(2) - : Parent->getOperand(3); - computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG); - FlagSet |= PPC::MOF_Vector; - return FlagSet; + if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) { + SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp) + ? Parent->getOperand(2) + : Parent->getOperand(3); + computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG); + FlagSet |= PPC::MOF_Vector; + return FlagSet; + } } // Mark this as something we don't want to handle here if it is atomic diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 2cfd53de3290..c16e146da247 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -393,7 +393,9 @@ public: MachineInstr &NewMI1, MachineInstr &NewMI2) const override; - void setSpecialOperandAttr(MachineInstr &MI, uint16_t Flags) const override; + // PowerPC specific version of setSpecialOperandAttr that copies Flags to MI + // and clears nuw, nsw, and exact flags. 
+ void setSpecialOperandAttr(MachineInstr &MI, uint16_t Flags) const; bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index d83ecc699b19..2340be5b5915 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -4780,6 +4780,7 @@ class PPCAsmPseudo<string asm, dag iops> def : InstAlias<"sc", (SC 0)>; def : InstAlias<"sync", (SYNC 0)>, Requires<[HasSYNC]>; +def : InstAlias<"hwsync", (SYNC 0), 0>, Requires<[HasSYNC]>; def : InstAlias<"msync", (SYNC 0), 0>, Requires<[HasSYNC]>; def : InstAlias<"lwsync", (SYNC 1)>, Requires<[HasSYNC]>; def : InstAlias<"ptesync", (SYNC 2)>, Requires<[HasSYNC]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index d92a10c5b208..110f7d79fbc5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -158,6 +158,11 @@ def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">; +def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getAlign() >= Align(16) && + isOffsetMultipleOf(N, 16); +}]>; + //--------------------- VSX-specific instruction formats ---------------------// // By default, all VSX instructions are to be selected over their Altivec // counter parts and they do not have unmodeled sideeffects. @@ -3180,6 +3185,12 @@ defm : ScalToVecWPermute< v2f64, (f64 (load ForceXForm:$src)), (XXPERMDIs (XFLOADf64 ForceXForm:$src), 2), (SUBREG_TO_REG (i64 1), (XFLOADf64 ForceXForm:$src), sub_64)>; + +// Splat loads. 
+def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)), + (v8i16 (VSPLTH 7, (LVX ForceXForm:$A)))>; +def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)), + (v16i8 (VSPLTB 15, (LVX ForceXForm:$A)))>; } // HasVSX, NoP9Vector, IsLittleEndian let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in { @@ -3187,6 +3198,12 @@ let Predicates = [HasVSX, NoP9Vector, IsBigEndian] in { (LXVD2X ForceXForm:$src)>; def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, ForceXForm:$dst), (STXVD2X $rS, ForceXForm:$dst)>; + + // Splat loads. + def : Pat<(v8i16 (PPCldsplatAlign16 ForceXForm:$A)), + (v8i16 (VSPLTH 0, (LVX ForceXForm:$A)))>; + def : Pat<(v16i8 (PPCldsplatAlign16 ForceXForm:$A)), + (v16i8 (VSPLTB 0, (LVX ForceXForm:$A)))>; } // HasVSX, NoP9Vector, IsBigEndian // Any VSX subtarget that only has loads and stores that load in big endian diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 7f63827afbd6..0c7be96a0595 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -413,9 +413,9 @@ bool PPCLoopInstrFormPrep::runOnFunction(Function &F) { bool MadeChange = false; - for (auto I = LI->begin(), IE = LI->end(); I != IE; ++I) - for (auto L = df_begin(*I), LE = df_end(*I); L != LE; ++L) - MadeChange |= runOnLoop(*L); + for (Loop *I : *LI) + for (Loop *L : depth_first(I)) + MadeChange |= runOnLoop(L); return MadeChange; } diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index e4954b722fd0..6b8ad22639c8 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -153,5 +153,7 @@ FUSION_FEATURE(ZeroMoveLR, hasZeroMoveFusion, -1, FUSION_OP_SET(MTLR8, MTLR, MTSPR8, MTSPR), FUSION_OP_SET(BCLR, BCLRn, gBCLR, BCLRL, BCLRLn, gBCLRL)) +#include "PPCBack2BackFusion.def" + #undef FUSION_FEATURE #undef FUSION_OP_SET diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp 
b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 1258a1281597..f11b4e14073e 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -135,6 +135,7 @@ void PPCSubtarget::initializeEnvironment() { HasCompareFusion = false; HasWideImmFusion = false; HasZeroMoveFusion = false; + HasBack2BackFusion = false; IsISA2_06 = false; IsISA2_07 = false; IsISA3_0 = false; diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index d52833cb1465..1300b62b623a 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -155,6 +155,7 @@ protected: bool HasCompareFusion; bool HasWideImmFusion; bool HasZeroMoveFusion; + bool HasBack2BackFusion; bool IsISA2_06; bool IsISA2_07; bool IsISA3_0; @@ -348,6 +349,7 @@ public: bool hasWideImmFusion() const { return HasWideImmFusion; } bool hasSha3Fusion() const { return HasSha3Fusion; } bool hasZeroMoveFusion() const { return HasZeroMoveFusion; } + bool hasBack2BackFusion() const { return HasBack2BackFusion; } bool needsSwapsForVSXMemOps() const { return hasVSX() && isLittleEndian() && !hasP9Vector(); } diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 5d6f58a77a39..ed28731b8ef2 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -328,10 +328,6 @@ static bool isMMAType(Type *Ty) { InstructionCost PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands, TTI::TargetCostKind CostKind) { - // Set the max cost if an MMA type is present (v256i1, v512i1). - if (isMMAType(U->getType())) - return InstructionCost::getMax(); - // We already implement getCastInstrCost and getMemoryOpCost where we perform // the vector adjustment there. 
if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U)) @@ -1276,23 +1272,21 @@ PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); } -bool PPCTTIImpl::areFunctionArgsABICompatible( - const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { +bool PPCTTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + const ArrayRef<Type *> &Types) const { // We need to ensure that argument promotion does not // attempt to promote pointers to MMA types (__vector_pair // and __vector_quad) since these types explicitly cannot be // passed as arguments. Both of these types are larger than // the 128-bit Altivec vectors and have a scalar size of 1 bit. - if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) + if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; - return llvm::none_of(Args, [](Argument *A) { - auto *EltTy = cast<PointerType>(A->getType())->getElementType(); - if (EltTy->isSized()) - return (EltTy->isIntOrIntVectorTy(1) && - EltTy->getPrimitiveSizeInBits() > 128); + return llvm::none_of(Types, [](Type *Ty) { + if (Ty->isSized()) + return Ty->isIntOrIntVectorTy(1) && Ty->getPrimitiveSizeInBits() > 128; return false; }); } @@ -1388,3 +1382,86 @@ bool PPCTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, return false; } + +bool PPCTTIImpl::hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const { + // Only load and stores instructions can have variable vector length on Power. + if (Opcode != Instruction::Load && Opcode != Instruction::Store) + return false; + // Loads/stores with length instructions use bits 0-7 of the GPR operand and + // therefore cannot be used in 32-bit mode. 
+ if ((!ST->hasP9Vector() && !ST->hasP10Vector()) || !ST->isPPC64()) + return false; + if (isa<FixedVectorType>(DataType)) { + unsigned VecWidth = DataType->getPrimitiveSizeInBits(); + return VecWidth == 128; + } + Type *ScalarTy = DataType->getScalarType(); + + if (ScalarTy->isPointerTy()) + return true; + + if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) + return true; + + if (!ScalarTy->isIntegerTy()) + return false; + + unsigned IntWidth = ScalarTy->getIntegerBitWidth(); + return IntWidth == 8 || IntWidth == 16 || IntWidth == 32 || IntWidth == 64; +} + +InstructionCost PPCTTIImpl::getVPMemoryOpCost(unsigned Opcode, Type *Src, + Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I) { + InstructionCost Cost = BaseT::getVPMemoryOpCost(Opcode, Src, Alignment, + AddressSpace, CostKind, I); + if (TLI->getValueType(DL, Src, true) == MVT::Other) + return Cost; + // TODO: Handle other cost kinds. + if (CostKind != TTI::TCK_RecipThroughput) + return Cost; + + assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && + "Invalid Opcode"); + + auto *SrcVTy = dyn_cast<FixedVectorType>(Src); + assert(SrcVTy && "Expected a vector type for VP memory operations"); + + if (hasActiveVectorLength(Opcode, Src, Alignment)) { + std::pair<InstructionCost, MVT> LT = + TLI->getTypeLegalizationCost(DL, SrcVTy); + + InstructionCost CostFactor = + vectorCostAdjustmentFactor(Opcode, Src, nullptr); + if (!CostFactor.isValid()) + return InstructionCost::getMax(); + + InstructionCost Cost = LT.first * CostFactor; + assert(Cost.isValid() && "Expected valid cost"); + + // On P9 but not on P10, if the op is misaligned then it will cause a + // pipeline flush. Otherwise the VSX masked memops cost the same as unmasked + // ones. 
+ const Align DesiredAlignment(16); + if (Alignment >= DesiredAlignment || ST->getCPUDirective() != PPC::DIR_PWR9) + return Cost; + + // Since alignment may be under estimated, we try to compute the probability + // that the actual address is aligned to the desired boundary. For example + // an 8-byte aligned load is assumed to be actually 16-byte aligned half the + // time, while a 4-byte aligned load has a 25% chance of being 16-byte + // aligned. + float AlignmentProb = ((float)Alignment.value()) / DesiredAlignment.value(); + float MisalignmentProb = 1.0 - AlignmentProb; + return (MisalignmentProb * P9PipelineFlushEstimate) + + (AlignmentProb * *Cost.getValue()); + } + + // Usually we should not get to this point, but the following is an attempt to + // model the cost of legalization. Currently we can only lower intrinsics with + // evl but no mask, on Power 9/10. Otherwise, we must scalarize. + return getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); +} diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 7aeb0c59d503..0af6f2a308d9 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -134,9 +134,19 @@ public: bool UseMaskForCond = false, bool UseMaskForGaps = false); InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); - bool areFunctionArgsABICompatible(const Function *Caller, - const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const; + bool areTypesABICompatible(const Function *Caller, const Function *Callee, + const ArrayRef<Type *> &Types) const; + bool hasActiveVectorLength(unsigned Opcode, Type *DataType, + Align Alignment) const; + InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, + unsigned AddressSpace, + TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + +private: + // The following constant is 
used for estimating costs on power9. + static const InstructionCost::CostType P9PipelineFlushEstimate = 80; + /// @} }; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index f00813f1301a..75592dd4c6f5 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -169,6 +169,7 @@ class RISCVAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseJALOffset(OperandVector &Operands); OperandMatchResultTy parseVTypeI(OperandVector &Operands); OperandMatchResultTy parseMaskReg(OperandVector &Operands); + OperandMatchResultTy parseInsnDirectiveOpcode(OperandVector &Operands); bool parseOperand(OperandVector &Operands, StringRef Mnemonic); @@ -827,6 +828,7 @@ public: Op->SysReg.Length = Str.size(); Op->SysReg.Encoding = Encoding; Op->StartLoc = S; + Op->EndLoc = S; Op->IsRV64 = IsRV64; return Op; } @@ -836,6 +838,7 @@ public: auto Op = std::make_unique<RISCVOperand>(KindTy::VType); Op->VType.Val = VTypeI; Op->StartLoc = S; + Op->EndLoc = S; Op->IsRV64 = IsRV64; return Op; } @@ -1291,7 +1294,7 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, if (HadParens) Operands.push_back(RISCVOperand::createToken("(", FirstS, isRV64())); SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64())); } @@ -1305,6 +1308,67 @@ OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands, } OperandMatchResultTy +RISCVAsmParser::parseInsnDirectiveOpcode(OperandVector &Operands) { + SMLoc S = getLoc(); + SMLoc E; + const MCExpr *Res; + + switch (getLexer().getKind()) { + default: + return MatchOperand_NoMatch; + case AsmToken::LParen: + case AsmToken::Minus: + case AsmToken::Plus: + case AsmToken::Exclaim: + case AsmToken::Tilde: + case 
AsmToken::Integer: + case AsmToken::String: { + if (getParser().parseExpression(Res, E)) + return MatchOperand_ParseFail; + + auto *CE = dyn_cast<MCConstantExpr>(Res); + if (CE) { + int64_t Imm = CE->getValue(); + if (isUInt<7>(Imm)) { + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + } + } + + Twine Msg = "immediate must be an integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + case AsmToken::Identifier: { + StringRef Identifier; + if (getParser().parseIdentifier(Identifier)) + return MatchOperand_ParseFail; + + auto Opcode = RISCVInsnOpcode::lookupRISCVOpcodeByName(Identifier); + if (Opcode) { + Res = MCConstantExpr::create(Opcode->Value, getContext()); + E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); + return MatchOperand_Success; + } + + Twine Msg = "operand must be a valid opcode name or an " + "integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + case AsmToken::Percent: { + // Discard operand with modifier. 
+ Twine Msg = "immediate must be an integer in the range"; + Error(S, Msg + " [" + Twine(0) + ", " + Twine((1 << 7) - 1) + "]"); + return MatchOperand_ParseFail; + } + } + + return MatchOperand_NoMatch; +} + +OperandMatchResultTy RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) { SMLoc S = getLoc(); const MCExpr *Res; @@ -1381,7 +1445,7 @@ RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; const MCExpr *Res; switch (getLexer().getKind()) { @@ -1396,7 +1460,7 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { case AsmToken::Integer: case AsmToken::String: case AsmToken::Identifier: - if (getParser().parseExpression(Res)) + if (getParser().parseExpression(Res, E)) return MatchOperand_ParseFail; break; case AsmToken::Percent: @@ -1410,7 +1474,7 @@ OperandMatchResultTy RISCVAsmParser::parseImmediate(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; if (getLexer().getKind() != AsmToken::Percent) { Error(getLoc(), "expected '%' for operand modifier"); @@ -1449,7 +1513,6 @@ RISCVAsmParser::parseOperandWithModifier(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); const MCExpr *Res; if (getLexer().getKind() != AsmToken::Identifier) @@ -1461,6 +1524,8 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { if (getParser().parseIdentifier(Identifier)) return MatchOperand_ParseFail; + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + if (Identifier.consume_back("@plt")) { Error(getLoc(), "'@plt' operand not valid for 
instruction"); return MatchOperand_ParseFail; @@ -1492,7 +1557,7 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { } const MCExpr *Expr; - if (getParser().parseExpression(Expr)) + if (getParser().parseExpression(Expr, E)) return MatchOperand_ParseFail; Res = MCBinaryExpr::create(Opcode, Res, Expr, getContext()); Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64())); @@ -1501,7 +1566,6 @@ OperandMatchResultTy RISCVAsmParser::parseBareSymbol(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); const MCExpr *Res; if (getLexer().getKind() != AsmToken::Identifier) @@ -1515,6 +1579,8 @@ OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { if (getParser().parseIdentifier(Identifier)) return MatchOperand_ParseFail; + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); + RISCVMCExpr::VariantKind Kind = RISCVMCExpr::VK_RISCV_CALL; if (Identifier.consume_back("@plt")) Kind = RISCVMCExpr::VK_RISCV_CALL_PLT; @@ -1529,10 +1595,10 @@ OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { OperandMatchResultTy RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) { SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E; const MCExpr *Res; - if (getParser().parseExpression(Res)) + if (getParser().parseExpression(Res, E)) return MatchOperand_ParseFail; if (Res->getKind() != MCExpr::ExprKind::SymbolRef || @@ -1662,7 +1728,7 @@ OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) { if (RegNo != RISCV::V0) return MatchOperand_NoMatch; SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1); + SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64())); } @@ -2062,7 +2128,11 @@ bool 
RISCVAsmParser::parseDirectiveAttribute() { "unexpected token in '.attribute' directive")) return true; - if (Tag == RISCVAttrs::ARCH) { + if (IsIntegerValue) + getTargetStreamer().emitAttribute(Tag, IntegerValue); + else if (Tag != RISCVAttrs::ARCH) + getTargetStreamer().emitTextAttribute(Tag, StringValue); + else { StringRef Arch = StringValue; for (auto Feature : RISCVFeatureKV) if (llvm::RISCVISAInfo::isSupportedExtensionFeature(Feature.Key)) @@ -2070,7 +2140,7 @@ bool RISCVAsmParser::parseDirectiveAttribute() { auto ParseResult = llvm::RISCVISAInfo::parseArchString( StringValue, /*EnableExperimentalExtension=*/true, - /*ExperimentalExtensionVersionCheck=*/false); + /*ExperimentalExtensionVersionCheck=*/true); if (!ParseResult) { std::string Buffer; raw_string_ostream OutputErrMsg(Buffer); @@ -2093,35 +2163,9 @@ bool RISCVAsmParser::parseDirectiveAttribute() { setFeatureBits(RISCV::Feature64Bit, "64bit"); else return Error(ValueExprLoc, "bad arch string " + Arch); - } - - if (IsIntegerValue) - getTargetStreamer().emitAttribute(Tag, IntegerValue); - else { - if (Tag != RISCVAttrs::ARCH) { - getTargetStreamer().emitTextAttribute(Tag, StringValue); - } else { - std::vector<std::string> FeatureVector; - RISCVFeatures::toFeatureVector(FeatureVector, getSTI().getFeatureBits()); - // Parse that by RISCVISAInfo-> - unsigned XLen = getFeatureBits(RISCV::Feature64Bit) ? 64 : 32; - auto ParseResult = llvm::RISCVISAInfo::parseFeatures(XLen, FeatureVector); - if (!ParseResult) { - std::string Buffer; - raw_string_ostream OutputErrMsg(Buffer); - handleAllErrors(ParseResult.takeError(), - [&](llvm::StringError &ErrMsg) { - OutputErrMsg << ErrMsg.getMessage(); - }); - - return Error(ValueExprLoc, OutputErrMsg.str()); - } - auto &ISAInfo = *ParseResult; - - // Then emit the arch string. - getTargetStreamer().emitTextAttribute(Tag, ISAInfo->toString()); - } + // Then emit the arch string. 
+ getTargetStreamer().emitTextAttribute(Tag, ISAInfo->toString()); } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 0aba18b20f0d..144e761f002d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -27,6 +27,11 @@ namespace RISCVSysReg { #include "RISCVGenSearchableTables.inc" } // namespace RISCVSysReg +namespace RISCVInsnOpcode { +#define GET_RISCVOpcodesList_IMPL +#include "RISCVGenSearchableTables.inc" +} // namespace RISCVInsnOpcode + namespace RISCVABI { ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits, StringRef ABIName) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index d8f4403c824f..9cfd36745f46 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -299,6 +299,16 @@ struct SysReg { #include "RISCVGenSearchableTables.inc" } // end namespace RISCVSysReg +namespace RISCVInsnOpcode { +struct RISCVOpcode { + const char *Name; + unsigned Value; +}; + +#define GET_RISCVOpcodesList_DECL +#include "RISCVGenSearchableTables.inc" +} // end namespace RISCVInsnOpcode + namespace RISCVABI { enum ABI { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index f1c3810f4ee5..89a7d54f60f8 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -171,9 +171,9 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, - // or non-zero bits 8/9/10. + // or non-zero in bits 8 and above. 
if (RISCVVType::getVLMUL(Imm) == RISCVII::VLMUL::LMUL_RESERVED || - RISCVVType::getSEW(Imm) > 64 || (Imm & 0x700) != 0) { + RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) { O << Imm; return; } diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td index 772a4f8ecd53..6aa915c01929 100644 --- a/llvm/lib/Target/RISCV/RISCV.td +++ b/llvm/lib/Target/RISCV/RISCV.td @@ -168,14 +168,6 @@ def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">, AssemblerPredicate<(all_of FeatureStdExtZvlsseg), "'Zvlsseg' (Vector segment load/store instructions)">; -def FeatureStdExtZvamo - : SubtargetFeature<"experimental-zvamo", "HasStdExtZvamo", "true", - "'Zvamo' (Vector AMO Operations)", - [FeatureStdExtV]>; -def HasStdExtZvamo : Predicate<"Subtarget->hasStdExtZvamo()">, - AssemblerPredicate<(all_of FeatureStdExtZvamo), - "'Zvamo' (Vector AMO Operations)">; - def Feature64Bit : SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">; def IsRV64 : Predicate<"Subtarget->is64Bit()">, diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 66a34d73dd37..b24eb5f7bbf4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -718,6 +718,71 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; } + case ISD::MUL: { + // Special case for calculating (mul (and X, C2), C1) where the full product + // fits in XLen bits. We can shift X left by the number of leading zeros in + // C2 and shift C1 left by XLen-lzcnt(C2). This will ensure the final + // product has XLen trailing zeros, putting it in the output of MULHU. This + // can avoid materializing a constant in a register for C2. + + // RHS should be a constant. + auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + if (!N1C || !N1C->hasOneUse()) + break; + + // LHS should be an AND with constant. 
+ SDValue N0 = Node->getOperand(0); + if (N0.getOpcode() != ISD::AND || !isa<ConstantSDNode>(N0.getOperand(1))) + break; + + uint64_t C2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue(); + + // Constant should be a mask. + if (!isMask_64(C2)) + break; + + // This should be the only use of the AND unless we will use + // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND + // constants. + if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF)) + break; + + // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this + // optimization. + if (isInt<12>(C2) || + (C2 == UINT64_C(0xFFFF) && + (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) || + (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba())) + break; + + // We need to shift left the AND input and C1 by a total of XLen bits. + + // How far left do we need to shift the AND input? + unsigned XLen = Subtarget->getXLen(); + unsigned LeadingZeros = XLen - (64 - countLeadingZeros(C2)); + + // The constant gets shifted by the remaining amount unless that would + // shift bits out. + uint64_t C1 = N1C->getZExtValue(); + unsigned ConstantShift = XLen - LeadingZeros; + if (ConstantShift > (XLen - (64 - countLeadingZeros(C1)))) + break; + + uint64_t ShiftedC1 = C1 << ConstantShift; + // If this RV32, we need to sign extend the constant. + if (XLen == 32) + ShiftedC1 = SignExtend64(ShiftedC1, 32); + + // Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))). 
+ SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget); + SDNode *SLLI = + CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(LeadingZeros, DL, VT)); + SDNode *MULHU = CurDAG->getMachineNode(RISCV::MULHU, DL, VT, + SDValue(SLLI, 0), SDValue(Imm, 0)); + ReplaceNode(Node, MULHU); + return; + } case ISD::INTRINSIC_WO_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(0); switch (IntNo) { @@ -1450,6 +1515,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Extract.getNode()); return; } + case ISD::SPLAT_VECTOR: case RISCVISD::VMV_V_X_VL: case RISCVISD::VFMV_V_F_VL: { // Try to match splat of a scalar load to a strided load with stride of x0. @@ -1466,7 +1532,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { break; SDValue VL; - selectVLOp(Node->getOperand(1), VL); + if (Node->getOpcode() == ISD::SPLAT_VECTOR) + VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT); + else + selectVLOp(Node->getOperand(1), VL); unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); SDValue SEW = CurDAG->getTargetConstant(Log2SEW, DL, XLenVT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f3331571fc55..4f5512e6fb37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -330,6 +330,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f16, Legal); setOperationAction(ISD::LROUND, MVT::f16, Legal); setOperationAction(ISD::LLROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, 
Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); @@ -367,6 +375,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f32, Legal); setOperationAction(ISD::LROUND, MVT::f32, Legal); setOperationAction(ISD::LLROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -388,6 +402,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::LLRINT, MVT::f64, Legal); setOperationAction(ISD::LROUND, MVT::f64, Legal); setOperationAction(ISD::LLROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); for (auto CC : FPCCToExpand) setCondCodeAction(CC, MVT::f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::f64, Expand); @@ -412,6 +434,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT_SAT, XLenVT, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, XLenVT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, XLenVT, Legal); + 
setOperationAction(ISD::STRICT_FP_TO_SINT, XLenVT, Legal); + setOperationAction(ISD::STRICT_UINT_TO_FP, XLenVT, Legal); + setOperationAction(ISD::STRICT_SINT_TO_FP, XLenVT, Legal); + setOperationAction(ISD::FLT_ROUNDS_, XLenVT, Custom); setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); } @@ -471,12 +498,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR, ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND, ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX, - ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN}; + ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN, + ISD::VP_SELECT}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD, - ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX}; + ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SELECT}; if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector @@ -519,6 +547,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); + setOperationAction(ISD::VP_AND, VT, Custom); + setOperationAction(ISD::VP_OR, VT, Custom); + setOperationAction(ISD::VP_XOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); setOperationAction(ISD::VECREDUCE_OR, VT, Custom); setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); @@ -803,6 +835,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, // Operations below are different for between masks and other vectors. 
if (VT.getVectorElementType() == MVT::i1) { + setOperationAction(ISD::VP_AND, VT, Custom); + setOperationAction(ISD::VP_OR, VT, Custom); + setOperationAction(ISD::VP_XOR, VT, Custom); setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::XOR, VT, Custom); @@ -1147,7 +1182,7 @@ bool RISCVTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasStdExtZbb(); } -bool RISCVTargetLowering::hasAndNot(SDValue Y) const { +bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const { EVT VT = Y.getValueType(); // FIXME: Support vectors once we have tests. @@ -1235,7 +1270,8 @@ bool RISCVTargetLowering::shouldSinkOperands( bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { - if (VT == MVT::f16 && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. + if (VT == MVT::f16 && !Subtarget.hasStdExtZfh()) return false; if (VT == MVT::f32 && !Subtarget.hasStdExtF()) return false; @@ -1255,9 +1291,10 @@ bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled. + // Use f32 to pass f16 if it is legal and Zfh is not enabled. // We might still end up using a GPR but that will be decided based on ABI. - if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. 
+ if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh()) return MVT::f32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); @@ -1266,9 +1303,10 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // Use f32 to pass f16 if it is legal and Zfhmin/Zfh is not enabled. + // Use f32 to pass f16 if it is legal and Zfh is not enabled. // We might still end up using a GPR but that will be decided based on ABI. - if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfhmin()) + // FIXME: Change to Zfhmin once f16 becomes a legal type with Zfhmin. + if (VT == MVT::f16 && Subtarget.hasStdExtF() && !Subtarget.hasStdExtZfh()) return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); @@ -1959,29 +1997,37 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, int64_t StepNumerator = SimpleVID->StepNumerator; unsigned StepDenominator = SimpleVID->StepDenominator; int64_t Addend = SimpleVID->Addend; + + assert(StepNumerator != 0 && "Invalid step"); + bool Negate = false; + int64_t SplatStepVal = StepNumerator; + unsigned StepOpcode = ISD::MUL; + if (StepNumerator != 1) { + if (isPowerOf2_64(std::abs(StepNumerator))) { + Negate = StepNumerator < 0; + StepOpcode = ISD::SHL; + SplatStepVal = Log2_64(std::abs(StepNumerator)); + } + } + // Only emit VIDs with suitably-small steps/addends. We use imm5 is a // threshold since it's the immediate value many RVV instructions accept. - if (isInt<5>(StepNumerator) && isPowerOf2_32(StepDenominator) && - isInt<5>(Addend)) { + // There is no vmul.vi instruction so ensure multiply constant can fit in + // a single addi instruction. 
+ if (((StepOpcode == ISD::MUL && isInt<12>(SplatStepVal)) || + (StepOpcode == ISD::SHL && isUInt<5>(SplatStepVal))) && + isPowerOf2_32(StepDenominator) && isInt<5>(Addend)) { SDValue VID = DAG.getNode(RISCVISD::VID_VL, DL, ContainerVT, Mask, VL); // Convert right out of the scalable type so we can use standard ISD // nodes for the rest of the computation. If we used scalable types with // these, we'd lose the fixed-length vector info and generate worse // vsetvli code. VID = convertFromScalableVector(VT, VID, DAG, Subtarget); - assert(StepNumerator != 0 && "Invalid step"); - bool Negate = false; - if (StepNumerator != 1) { - int64_t SplatStepVal = StepNumerator; - unsigned Opcode = ISD::MUL; - if (isPowerOf2_64(std::abs(StepNumerator))) { - Negate = StepNumerator < 0; - Opcode = ISD::SHL; - SplatStepVal = Log2_64(std::abs(StepNumerator)); - } + if ((StepOpcode == ISD::MUL && SplatStepVal != 1) || + (StepOpcode == ISD::SHL && SplatStepVal != 0)) { SDValue SplatStep = DAG.getSplatVector( VT, DL, DAG.getConstant(SplatStepVal, DL, XLenVT)); - VID = DAG.getNode(Opcode, DL, VT, VID, SplatStep); + VID = DAG.getNode(StepOpcode, DL, VT, VID, SplatStep); } if (StepDenominator != 1) { SDValue SplatStep = DAG.getSplatVector( @@ -3133,6 +3179,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: return lowerSET_ROUNDING(Op, DAG); + case ISD::VP_SELECT: + return lowerVPOp(Op, DAG, RISCVISD::VSELECT_VL); case ISD::VP_ADD: return lowerVPOp(Op, DAG, RISCVISD::ADD_VL); case ISD::VP_SUB: @@ -3148,11 +3196,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::VP_UREM: return lowerVPOp(Op, DAG, RISCVISD::UREM_VL); case ISD::VP_AND: - return lowerVPOp(Op, DAG, RISCVISD::AND_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMAND_VL, RISCVISD::AND_VL); case ISD::VP_OR: - return lowerVPOp(Op, DAG, RISCVISD::OR_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMOR_VL, RISCVISD::OR_VL); case ISD::VP_XOR: - 
return lowerVPOp(Op, DAG, RISCVISD::XOR_VL); + return lowerLogicVPOp(Op, DAG, RISCVISD::VMXOR_VL, RISCVISD::XOR_VL); case ISD::VP_ASHR: return lowerVPOp(Op, DAG, RISCVISD::SRA_VL); case ISD::VP_LSHR: @@ -4469,19 +4517,19 @@ SDValue RISCVTargetLowering::lowerVECREDUCE(SDValue Op, } MVT M1VT = getLMUL1VT(ContainerVT); + MVT XLenVT = Subtarget.getXLenVT(); SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. SDValue NeutralElem = DAG.getNeutralElement(BaseOpc, DL, VecEltVT, SDNodeFlags()); - SDValue IdentitySplat = DAG.getSplatVector(M1VT, DL, NeutralElem); + SDValue IdentitySplat = lowerScalarSplat( + NeutralElem, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), Vec, IdentitySplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); } @@ -4497,9 +4545,12 @@ getRVVFPReductionOpAndOperands(SDValue Op, SelectionDAG &DAG, EVT EltVT) { switch (Opcode) { default: llvm_unreachable("Unhandled reduction"); - case ISD::VECREDUCE_FADD: - return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0), - DAG.getNeutralElement(BaseOpcode, DL, EltVT, Flags)); + case ISD::VECREDUCE_FADD: { + // Use positive zero if we can. It is cheaper to materialize. + SDValue Zero = + DAG.getConstantFP(Flags.hasNoSignedZeros() ? 
0.0 : -0.0, DL, EltVT); + return std::make_tuple(RISCVISD::VECREDUCE_FADD_VL, Op.getOperand(0), Zero); + } case ISD::VECREDUCE_SEQ_FADD: return std::make_tuple(RISCVISD::VECREDUCE_SEQ_FADD_VL, Op.getOperand(1), Op.getOperand(0)); @@ -4530,17 +4581,17 @@ SDValue RISCVTargetLowering::lowerFPVECREDUCE(SDValue Op, } MVT M1VT = getLMUL1VT(VectorVal.getSimpleValueType()); + MVT XLenVT = Subtarget.getXLenVT(); SDValue Mask, VL; std::tie(Mask, VL) = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget); - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. - SDValue ScalarSplat = DAG.getSplatVector(M1VT, DL, ScalarVal); + SDValue ScalarSplat = lowerScalarSplat( + ScalarVal, DAG.getConstant(1, DL, XLenVT), M1VT, DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, DAG.getUNDEF(M1VT), VectorVal, ScalarSplat, Mask, VL); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VecEltVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); } static unsigned getRVVVPReductionOp(unsigned ISDOpcode) { @@ -4602,13 +4653,13 @@ SDValue RISCVTargetLowering::lowerVPREDUCE(SDValue Op, MVT XLenVT = Subtarget.getXLenVT(); MVT ResVT = !VecVT.isInteger() || VecEltVT.bitsGE(XLenVT) ? VecEltVT : XLenVT; - // FIXME: This is a VLMAX splat which might be too large and can prevent - // vsetvli removal. 
- SDValue StartSplat = DAG.getSplatVector(M1VT, DL, Op.getOperand(0)); + SDValue StartSplat = + lowerScalarSplat(Op.getOperand(0), DAG.getConstant(1, DL, XLenVT), M1VT, + DL, DAG, Subtarget); SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, StartSplat, Vec, StartSplat, Mask, VL); SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); + DAG.getConstant(0, DL, XLenVT)); if (!VecVT.isInteger()) return Elt0; return DAG.getSExtOrTrunc(Elt0, DL, Op.getValueType()); @@ -5365,6 +5416,33 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, VPOp, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, + unsigned MaskOpc, + unsigned VecOpc) const { + MVT VT = Op.getSimpleValueType(); + if (VT.getVectorElementType() != MVT::i1) + return lowerVPOp(Op, DAG, VecOpc); + + // It is safe to drop mask parameter as masked-off elements are undef. + SDValue Op1 = Op->getOperand(0); + SDValue Op2 = Op->getOperand(1); + SDValue VL = Op->getOperand(3); + + MVT ContainerVT = VT; + const bool IsFixed = VT.isFixedLengthVector(); + if (IsFixed) { + ContainerVT = getContainerForFixedLengthVector(VT); + Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget); + Op2 = convertToScalableVector(ContainerVT, Op2, DAG, Subtarget); + } + + SDLoc DL(Op); + SDValue Val = DAG.getNode(MaskOpc, DL, ContainerVT, Op1, Op2, VL); + if (!IsFixed) + return Val; + return convertFromScalableVector(VT, Val, DAG, Subtarget); +} + // Custom lower MGATHER/VP_GATHER to a legalized form for RVV. It will then be // matched to a RVV indexed load. The RVV indexed load instructions only // support the "unsigned unscaled" addressing mode; indices are implicitly @@ -5695,11 +5773,17 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, SDValue Op0 = IsStrict ? 
N->getOperand(1) : N->getOperand(0); if (getTypeAction(*DAG.getContext(), Op0.getValueType()) != TargetLowering::TypeSoftenFloat) { - // FIXME: Support strict FP. - if (IsStrict) - return; if (!isTypeLegal(Op0.getValueType())) return; + if (IsStrict) { + unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RTZ_RV64 + : RISCVISD::STRICT_FCVT_WU_RTZ_RV64; + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); + SDValue Res = DAG.getNode(Opc, DL, VTs, N->getOperand(0), Op0); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); + Results.push_back(Res.getValue(1)); + return; + } unsigned Opc = IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64; SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0); @@ -7026,7 +7110,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, if (SimplifyDemandedLowBitsHelper(1, Log2_32(BitWidth))) return SDValue(N, 0); - return combineGREVI_GORCI(N, DCI.DAG); + return combineGREVI_GORCI(N, DAG); } case RISCVISD::GREVW: case RISCVISD::GORCW: { @@ -7035,7 +7119,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, SimplifyDemandedLowBitsHelper(1, 5)) return SDValue(N, 0); - return combineGREVI_GORCI(N, DCI.DAG); + return combineGREVI_GORCI(N, DAG); } case RISCVISD::SHFL: case RISCVISD::UNSHFL: { @@ -7120,11 +7204,23 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // Fold (zero_extend (fp_to_uint X)) to prevent forming fcvt+zexti32 during // type legalization. This is safe because fp_to_uint produces poison if // it overflows. 
- if (N->getValueType(0) == MVT::i64 && Subtarget.is64Bit() && - N->getOperand(0).getOpcode() == ISD::FP_TO_UINT && - isTypeLegal(N->getOperand(0).getOperand(0).getValueType())) - return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), MVT::i64, - N->getOperand(0).getOperand(0)); + if (N->getValueType(0) == MVT::i64 && Subtarget.is64Bit()) { + SDValue Src = N->getOperand(0); + if (Src.getOpcode() == ISD::FP_TO_UINT && + isTypeLegal(Src.getOperand(0).getValueType())) + return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), MVT::i64, + Src.getOperand(0)); + if (Src.getOpcode() == ISD::STRICT_FP_TO_UINT && Src.hasOneUse() && + isTypeLegal(Src.getOperand(1).getValueType())) { + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); + SDValue Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, SDLoc(N), VTs, + Src.getOperand(0), Src.getOperand(1)); + DCI.CombineTo(N, Res); + DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), Res.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(Src.getNode()); + return SDValue(N, 0); // Return N so it doesn't get rechecked. + } + } return SDValue(); case RISCVISD::SELECT_CC: { // Transform @@ -7685,6 +7781,8 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode( case RISCVISD::BDECOMPRESSW: case RISCVISD::FCVT_W_RTZ_RV64: case RISCVISD::FCVT_WU_RTZ_RV64: + case RISCVISD::STRICT_FCVT_W_RTZ_RV64: + case RISCVISD::STRICT_FCVT_WU_RTZ_RV64: // TODO: As the result is sign-extended, this is conservatively correct. A // more precise answer could be calculated for SRAW depending on known // bits in the shift amount. @@ -8004,6 +8102,22 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } } +void RISCVTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const { + // Add FRM dependency to any instructions with dynamic rounding mode. 
+ unsigned Opc = MI.getOpcode(); + auto Idx = RISCV::getNamedOperandIdx(Opc, RISCV::OpName::frm); + if (Idx < 0) + return; + if (MI.getOperand(Idx).getImm() != RISCVFPRndMode::DYN) + return; + // If the instruction already reads FRM, don't add another read. + if (MI.readsRegister(RISCV::FRM)) + return; + MI.addOperand( + MachineOperand::CreateReg(RISCV::FRM, /*isDef*/ false, /*isImp*/ true)); +} + // Calling Convention Implementation. // The expectations for frontend ABI lowering vary from target to target. // Ideally, an LLVM frontend would be able to avoid worrying about many ABI @@ -9400,6 +9514,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FCVT_XU_RTZ) NODE_NAME_CASE(FCVT_W_RTZ_RV64) NODE_NAME_CASE(FCVT_WU_RTZ_RV64) + NODE_NAME_CASE(STRICT_FCVT_W_RTZ_RV64) + NODE_NAME_CASE(STRICT_FCVT_WU_RTZ_RV64) NODE_NAME_CASE(READ_CYCLE_WIDE) NODE_NAME_CASE(GREV) NODE_NAME_CASE(GREVW) @@ -9541,6 +9657,9 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': + // TODO: Support fixed vectors up to XLen for P extension? 
+ if (VT.isVector()) + break; return std::make_pair(0U, &RISCV::GPRRegClass); case 'f': if (Subtarget.hasStdExtZfh() && VT == MVT::f16) @@ -9553,17 +9672,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, default: break; } - } else { - if (Constraint == "vr") { - for (const auto *RC : {&RISCV::VRRegClass, &RISCV::VRM2RegClass, - &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) { - if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) - return std::make_pair(0U, RC); - } - } else if (Constraint == "vm") { - if (TRI->isTypeLegalForClass(RISCV::VMRegClass, VT.SimpleTy)) - return std::make_pair(0U, &RISCV::VMRegClass); + } else if (Constraint == "vr") { + for (const auto *RC : {&RISCV::VRRegClass, &RISCV::VRM2RegClass, + &RISCV::VRM4RegClass, &RISCV::VRM8RegClass}) { + if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) + return std::make_pair(0U, RC); } + } else if (Constraint == "vm") { + if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) + return std::make_pair(0U, &RISCV::VMV0RegClass); } // Clang will correctly decode the usage of register name aliases into their @@ -10101,17 +10218,29 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize(); if (PartVTBitSize % ValueVTBitSize == 0) { + assert(PartVTBitSize >= ValueVTBitSize); // If the element types are different, bitcast to the same element type of // PartVT first. + // Give an example here, we want copy a <vscale x 1 x i8> value to + // <vscale x 4 x i16>. + // We need to convert <vscale x 1 x i8> to <vscale x 8 x i8> by insert + // subvector, then we can bitcast to <vscale x 4 x i16>. 
if (ValueEltVT != PartEltVT) { - unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits(); - assert(Count != 0 && "The number of element should not be zero."); - EVT SameEltTypeVT = - EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true); - Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); + if (PartVTBitSize > ValueVTBitSize) { + unsigned Count = PartVTBitSize / ValueEltVT.getFixedSizeInBits(); + assert(Count != 0 && "The number of element should not be zero."); + EVT SameEltTypeVT = + EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); + Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SameEltTypeVT, + DAG.getUNDEF(SameEltTypeVT), Val, + DAG.getVectorIdxConstant(0, DL)); + } + Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); + } else { + Val = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT), + Val, DAG.getVectorIdxConstant(0, DL)); } - Val = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT), - Val, DAG.getConstant(0, DL, Subtarget.getXLenVT())); Parts[0] = Val; return true; } @@ -10141,19 +10270,23 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinSize(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinSize(); if (PartVTBitSize % ValueVTBitSize == 0) { + assert(PartVTBitSize >= ValueVTBitSize); EVT SameEltTypeVT = ValueVT; // If the element types are different, convert it to the same element type // of PartVT. + // Give an example here, we want copy a <vscale x 1 x i8> value from + // <vscale x 4 x i16>. + // We need to convert <vscale x 4 x i16> to <vscale x 8 x i8> first, + // then we can extract <vscale x 1 x i8>. 
if (ValueEltVT != PartEltVT) { - unsigned Count = ValueVTBitSize / PartEltVT.getSizeInBits(); + unsigned Count = PartVTBitSize / ValueEltVT.getFixedSizeInBits(); assert(Count != 0 && "The number of element should not be zero."); SameEltTypeVT = - EVT::getVectorVT(Context, PartEltVT, Count, /*IsScalable=*/true); + EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); + Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); } - Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SameEltTypeVT, Val, - DAG.getConstant(0, DL, Subtarget.getXLenVT())); - if (ValueEltVT != PartEltVT) - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); + Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val, + DAG.getVectorIdxConstant(0, DL)); return Val; } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 849928eb46ae..48c5ce730933 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -282,6 +282,11 @@ enum NodeType : unsigned { // the value read before the modification and the new chain pointer. SWAP_CSR, + // FP to 32 bit int conversions for RV64. These are used to keep track of the + // result being sign extended to 64 bit. These saturate out of range inputs. + STRICT_FCVT_W_RTZ_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE, + STRICT_FCVT_WU_RTZ_RV64, + // Memory opcodes start here. 
VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE, VSE_VL, @@ -315,7 +320,7 @@ public: bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; - bool hasAndNot(SDValue Y) const override; + bool hasAndNotCompare(SDValue Y) const override; bool shouldSinkOperands(Instruction *I, SmallVectorImpl<Use *> &Ops) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT, @@ -383,6 +388,9 @@ public: EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; + void AdjustInstrPostInstrSelection(MachineInstr &MI, + SDNode *Node) const override; + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -593,6 +601,8 @@ private: SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG, unsigned NewOpc, bool HasMask = true) const; SDValue lowerVPOp(SDValue Op, SelectionDAG &DAG, unsigned RISCVISDOpc) const; + SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, + unsigned VecOpc) const; SDValue lowerFixedLengthVectorExtendToRVV(SDValue Op, SelectionDAG &DAG, unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index cfad4cdb9364..6a16b6354f95 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -107,31 +107,44 @@ def Vcompress : RISCVVConstraint<!or(VS2Constraint.Value, // The following opcode names match those given in Table 19.1 in the // RISC-V User-level ISA specification ("RISC-V base opcode map"). 
-class RISCVOpcode<bits<7> val> { +class RISCVOpcode<string name, bits<7> val> { + string Name = name; bits<7> Value = val; } -def OPC_LOAD : RISCVOpcode<0b0000011>; -def OPC_LOAD_FP : RISCVOpcode<0b0000111>; -def OPC_MISC_MEM : RISCVOpcode<0b0001111>; -def OPC_OP_IMM : RISCVOpcode<0b0010011>; -def OPC_AUIPC : RISCVOpcode<0b0010111>; -def OPC_OP_IMM_32 : RISCVOpcode<0b0011011>; -def OPC_STORE : RISCVOpcode<0b0100011>; -def OPC_STORE_FP : RISCVOpcode<0b0100111>; -def OPC_AMO : RISCVOpcode<0b0101111>; -def OPC_OP : RISCVOpcode<0b0110011>; -def OPC_LUI : RISCVOpcode<0b0110111>; -def OPC_OP_32 : RISCVOpcode<0b0111011>; -def OPC_MADD : RISCVOpcode<0b1000011>; -def OPC_MSUB : RISCVOpcode<0b1000111>; -def OPC_NMSUB : RISCVOpcode<0b1001011>; -def OPC_NMADD : RISCVOpcode<0b1001111>; -def OPC_OP_FP : RISCVOpcode<0b1010011>; -def OPC_OP_V : RISCVOpcode<0b1010111>; -def OPC_BRANCH : RISCVOpcode<0b1100011>; -def OPC_JALR : RISCVOpcode<0b1100111>; -def OPC_JAL : RISCVOpcode<0b1101111>; -def OPC_SYSTEM : RISCVOpcode<0b1110011>; +def RISCVOpcodesList : GenericTable { + let FilterClass = "RISCVOpcode"; + let Fields = [ + "Name", "Value" + ]; + let PrimaryKey = [ "Value" ]; + let PrimaryKeyName = "lookupRISCVOpcodeByValue"; +} +def lookupRISCVOpcodeByName : SearchIndex { + let Table = RISCVOpcodesList; + let Key = [ "Name" ]; +} +def OPC_LOAD : RISCVOpcode<"LOAD", 0b0000011>; +def OPC_LOAD_FP : RISCVOpcode<"LOAD_FP", 0b0000111>; +def OPC_MISC_MEM : RISCVOpcode<"MISC_MEM", 0b0001111>; +def OPC_OP_IMM : RISCVOpcode<"OP_IMM", 0b0010011>; +def OPC_AUIPC : RISCVOpcode<"AUIPC", 0b0010111>; +def OPC_OP_IMM_32 : RISCVOpcode<"OP_IMM_32", 0b0011011>; +def OPC_STORE : RISCVOpcode<"STORE", 0b0100011>; +def OPC_STORE_FP : RISCVOpcode<"STORE_FP", 0b0100111>; +def OPC_AMO : RISCVOpcode<"AMO", 0b0101111>; +def OPC_OP : RISCVOpcode<"OP", 0b0110011>; +def OPC_LUI : RISCVOpcode<"LUI", 0b0110111>; +def OPC_OP_32 : RISCVOpcode<"OP_32", 0b0111011>; +def OPC_MADD : RISCVOpcode<"MADD", 0b1000011>; +def 
OPC_MSUB : RISCVOpcode<"MSUB", 0b1000111>; +def OPC_NMSUB : RISCVOpcode<"NMSUB", 0b1001011>; +def OPC_NMADD : RISCVOpcode<"NMADD", 0b1001111>; +def OPC_OP_FP : RISCVOpcode<"OP_FP", 0b1010011>; +def OPC_OP_V : RISCVOpcode<"OP_V", 0b1010111>; +def OPC_BRANCH : RISCVOpcode<"BRANCH", 0b1100011>; +def OPC_JALR : RISCVOpcode<"JALR", 0b1100111>; +def OPC_JAL : RISCVOpcode<"JAL", 0b1101111>; +def OPC_SYSTEM : RISCVOpcode<"SYSTEM", 0b1110011>; class RVInst<dag outs, dag ins, string opcodestr, string argstr, list<dag> pattern, InstFormat format> @@ -188,8 +201,7 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr, // Pseudo instructions class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string argstr = ""> - : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo>, - Sched<[]> { + : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo> { let isPseudo = 1; let isCodeGenOnly = 1; } @@ -265,14 +277,14 @@ class RVInstR4Frm<bits<2> funct2, RISCVOpcode opcode, dag outs, dag ins, bits<5> rs3; bits<5> rs2; bits<5> rs1; - bits<3> funct3; + bits<3> frm; bits<5> rd; let Inst{31-27} = rs3; let Inst{26-25} = funct2; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = funct3; + let Inst{14-12} = frm; let Inst{11-7} = rd; let Opcode = opcode.Value; } @@ -300,13 +312,13 @@ class RVInstRFrm<bits<7> funct7, RISCVOpcode opcode, dag outs, dag ins, : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> { bits<5> rs2; bits<5> rs1; - bits<3> funct3; + bits<3> frm; bits<5> rd; let Inst{31-25} = funct7; let Inst{24-20} = rs2; let Inst{19-15} = rs1; - let Inst{14-12} = funct3; + let Inst{14-12} = frm; let Inst{11-7} = rd; let Opcode = opcode.Value; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td index 80f46b73bfd7..69e9d3553b30 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td @@ -45,19 +45,6 @@ def SUMOPUnitStride : 
RISCVLSUMOP<0b00000>; def SUMOPUnitStrideMask : RISCVLSUMOP<0b01011>; def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>; -class RISCVAMOOP<bits<5> val> { - bits<5> Value = val; -} -def AMOOPVamoSwap : RISCVAMOOP<0b00001>; -def AMOOPVamoAdd : RISCVAMOOP<0b00000>; -def AMOOPVamoXor : RISCVAMOOP<0b00100>; -def AMOOPVamoAnd : RISCVAMOOP<0b01100>; -def AMOOPVamoOr : RISCVAMOOP<0b01000>; -def AMOOPVamoMin : RISCVAMOOP<0b10000>; -def AMOOPVamoMax : RISCVAMOOP<0b10100>; -def AMOOPVamoMinu : RISCVAMOOP<0b11000>; -def AMOOPVamoMaxu : RISCVAMOOP<0b11100>; - class RISCVWidth<bits<4> val> { bits<4> Value = val; } @@ -342,22 +329,3 @@ class RVInstVSX<bits<3> nf, bit mew, RISCVMOP mop, bits<3> width, let Uses = [VTYPE, VL]; } - -class RVInstVAMO<RISCVAMOOP amoop, bits<3> width, dag outs, - dag ins, string opcodestr, string argstr> - : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> { - bits<5> vs2; - bits<5> rs1; - bit wd; - bit vm; - - let Inst{31-27} = amoop.Value; - let Inst{26} = wd; - let Inst{25} = vm; - let Inst{24-20} = vs2; - let Inst{19-15} = rs1; - let Inst{14-12} = width; - let Opcode = OPC_AMO.Value; - - let Uses = [VTYPE, VL]; -} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 547d82550cac..2e2e00886d57 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -35,6 +35,7 @@ using namespace llvm; #include "RISCVGenCompressInstEmitter.inc" #define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRINFO_NAMED_OPS #include "RISCVGenInstrInfo.inc" static cl::opt<bool> PreferWholeRegisterMove( @@ -1059,6 +1060,7 @@ bool RISCVInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { break; case RISCV::FSGNJ_D: case RISCV::FSGNJ_S: + case RISCV::FSGNJ_H: // The canonical floating-point move is fsgnj rd, rs, rs. 
return MI.getOperand(1).isReg() && MI.getOperand(2).isReg() && MI.getOperand(1).getReg() == MI.getOperand(2).getReg(); @@ -1087,6 +1089,7 @@ RISCVInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { break; case RISCV::FSGNJ_D: case RISCV::FSGNJ_S: + case RISCV::FSGNJ_H: // The canonical floating-point move is fsgnj rd, rs, rs. if (MI.getOperand(1).isReg() && MI.getOperand(2).isReg() && MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) @@ -1254,7 +1257,7 @@ bool RISCVInstrInfo::isFunctionSafeToOutlineFrom( bool RISCVInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, unsigned &Flags) const { // More accurate safety checking is done in getOutliningCandidateInfo. - return true; + return TargetInstrInfo::isMBBSafeToOutlineFrom(MBB, Flags); } // Enum values indicating how an outlined call should be constructed. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 2bfad7844c43..da0877c4299a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/IR/DiagnosticInfo.h" #define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_OPERAND_ENUM #include "RISCVGenInstrInfo.inc" namespace llvm { @@ -181,6 +182,10 @@ protected: }; namespace RISCV { + +// Implemented in RISCVGenInstrInfo.inc +int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); + // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. 
static constexpr int64_t VLMaxSentinel = -1LL; } // namespace RISCV diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 6f9cde966132..71eb6f01a4f4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -174,6 +174,20 @@ def uimm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isUInt<5>(Imm);}]> { let OperandNamespace = "RISCVOp"; } +def InsnDirectiveOpcode : AsmOperandClass { + let Name = "InsnDirectiveOpcode"; + let ParserMethod = "parseInsnDirectiveOpcode"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isImm"; +} + +def uimm7_opcode : Operand<XLenVT> { + let ParserMatchClass = InsnDirectiveOpcode; + let DecoderMethod = "decodeUImmOperand<7>"; + let OperandType = "OPERAND_UIMM7"; + let OperandNamespace = "RISCVOp"; +} + def uimm7 : Operand<XLenVT> { let ParserMatchClass = UImmAsmOperand<7>; let DecoderMethod = "decodeUImmOperand<7>"; @@ -878,35 +892,35 @@ def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF), 0>; // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. 
let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { -def InsnR : DirectiveInsnR<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3, +def InsnR : DirectiveInsnR<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, uimm7:$funct7, AnyReg:$rs1, AnyReg:$rs2), "$opcode, $funct3, $funct7, $rd, $rs1, $rs2">; -def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3), "$opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3">; -def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode, uimm3:$funct3, +def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rd, $rs1, $imm12">; -def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rd, ${imm12}(${rs1})">; -def InsnB : DirectiveInsnB<(outs), (ins uimm7:$opcode, uimm3:$funct3, +def InsnB : DirectiveInsnB<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12), "$opcode, $funct3, $rs1, $rs2, $imm12">; -def InsnU : DirectiveInsnU<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnU : DirectiveInsnU<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm20_lui:$imm20), "$opcode, $rd, $imm20">; -def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7:$opcode, +def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20), "$opcode, $rd, $imm20">; -def InsnS : DirectiveInsnS<(outs), (ins uimm7:$opcode, uimm3:$funct3, +def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; @@ -918,37 +932,37 @@ def InsnS : 
DirectiveInsnS<(outs), (ins uimm7:$opcode, uimm3:$funct3, // for known formats. let EmitPriority = 0 in { def : InstAlias<".insn_r $opcode, $funct3, $funct7, $rd, $rs1, $rs2", - (InsnR AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm7:$funct7, + (InsnR AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm7:$funct7, AnyReg:$rs1, AnyReg:$rs2)>; // Accept 4 register form of ".insn r" as alias for ".insn r4". def : InstAlias<".insn_r $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3", - (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2, + (InsnR4 AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>; def : InstAlias<".insn_r4 $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3", - (InsnR4 AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, uimm2:$funct2, + (InsnR4 AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, uimm2:$funct2, AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, $rs1, $imm12", - (InsnI AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnI AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, ${imm12}(${rs1})", - (InsnI_Mem AnyReg:$rd, uimm7:$opcode, uimm3:$funct3, + (InsnI_Mem AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, simm12:$imm12)>; def : InstAlias<".insn_b $opcode, $funct3, $rs1, $rs2, $imm12", - (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnB uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12)>; // Accept sb as an alias for b. 
def : InstAlias<".insn_sb $opcode, $funct3, $rs1, $rs2, $imm12", - (InsnB uimm7:$opcode, uimm3:$funct3, AnyReg:$rs1, + (InsnB uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, simm13_lsb0:$imm12)>; def : InstAlias<".insn_u $opcode, $rd, $imm20", - (InsnU AnyReg:$rd, uimm7:$opcode, uimm20_lui:$imm20)>; + (InsnU AnyReg:$rd, uimm7_opcode:$opcode, uimm20_lui:$imm20)>; def : InstAlias<".insn_j $opcode, $rd, $imm20", - (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>; + (InsnJ AnyReg:$rd, uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20)>; // Accept uj as an alias for j. def : InstAlias<".insn_uj $opcode, $rd, $imm20", - (InsnJ AnyReg:$rd, uimm7:$opcode, simm21_lsb0_jal:$imm20)>; + (InsnJ AnyReg:$rd, uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20)>; def : InstAlias<".insn_s $opcode, $funct3, $rs2, ${imm12}(${rs1})", - (InsnS uimm7:$opcode, uimm3:$funct3, AnyReg:$rs2, + (InsnS uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12)>; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 2cd011a02345..d6c31c4804db 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -26,41 +26,6 @@ def RISCVBuildPairF64 : SDNode<"RISCVISD::BuildPairF64", SDT_RISCVBuildPairF64>; def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>; //===----------------------------------------------------------------------===// -// Instruction Class Templates -//===----------------------------------------------------------------------===// - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAD_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b01, opcode, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; - -class FPFMADDynFrmAlias<FPFMAD_rrr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR64:$rd, FPR64:$rs1, 
FPR64:$rs2, FPR64:$rs3, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUD_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUD_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR64:$rd), - (ins FPR64:$rs1, FPR64:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; - -class FPALUDDynFrmAlias<FPALUD_rr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR64:$rd, FPR64:$rs1, FPR64:$rs2, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpD_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010001, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp64, ReadFCmp64, ReadFCmp64]>; - -//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -81,126 +46,104 @@ def FSD : RVInstS<0b011, OPC_STORE_FP, (outs), "fsd", "$rs2, ${imm12}(${rs1})">, Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>; -def FMADD_D : FPFMAD_rrr_frm<OPC_MADD, "fmadd.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FMADD_D, "fmadd.d">; -def FMSUB_D : FPFMAD_rrr_frm<OPC_MSUB, "fmsub.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FMSUB_D, "fmsub.d">; -def FNMSUB_D : FPFMAD_rrr_frm<OPC_NMSUB, "fnmsub.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FNMSUB_D, "fnmsub.d">; -def FNMADD_D : FPFMAD_rrr_frm<OPC_NMADD, "fnmadd.d">, - Sched<[WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64]>; -def : FPFMADDynFrmAlias<FNMADD_D, "fnmadd.d">; +let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { +def 
FMADD_D : FPFMA_rrr_frm<OPC_MADD, 0b01, "fmadd.d", FPR64>; +def FMSUB_D : FPFMA_rrr_frm<OPC_MSUB, 0b01, "fmsub.d", FPR64>; +def FNMSUB_D : FPFMA_rrr_frm<OPC_NMSUB, 0b01, "fnmsub.d", FPR64>; +def FNMADD_D : FPFMA_rrr_frm<OPC_NMADD, 0b01, "fnmadd.d", FPR64>; +} + +def : FPFMADynFrmAlias<FMADD_D, "fmadd.d", FPR64>; +def : FPFMADynFrmAlias<FMSUB_D, "fmsub.d", FPR64>; +def : FPFMADynFrmAlias<FNMSUB_D, "fnmsub.d", FPR64>; +def : FPFMADynFrmAlias<FNMADD_D, "fnmadd.d", FPR64>; -def FADD_D : FPALUD_rr_frm<0b0000001, "fadd.d">, +def FADD_D : FPALU_rr_frm<0b0000001, "fadd.d", FPR64>, Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def : FPALUDDynFrmAlias<FADD_D, "fadd.d">; -def FSUB_D : FPALUD_rr_frm<0b0000101, "fsub.d">, +def FSUB_D : FPALU_rr_frm<0b0000101, "fsub.d", FPR64>, Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>; -def : FPALUDDynFrmAlias<FSUB_D, "fsub.d">; -def FMUL_D : FPALUD_rr_frm<0b0001001, "fmul.d">, +def FMUL_D : FPALU_rr_frm<0b0001001, "fmul.d", FPR64>, Sched<[WriteFMul64, ReadFMul64, ReadFMul64]>; -def : FPALUDDynFrmAlias<FMUL_D, "fmul.d">; -def FDIV_D : FPALUD_rr_frm<0b0001101, "fdiv.d">, +def FDIV_D : FPALU_rr_frm<0b0001101, "fdiv.d", FPR64>, Sched<[WriteFDiv64, ReadFDiv64, ReadFDiv64]>; -def : FPALUDDynFrmAlias<FDIV_D, "fdiv.d">; -def FSQRT_D : FPUnaryOp_r_frm<0b0101101, FPR64, FPR64, "fsqrt.d">, - Sched<[WriteFSqrt64, ReadFSqrt64]> { - let rs2 = 0b00000; -} -def : FPUnaryOpDynFrmAlias<FSQRT_D, "fsqrt.d", FPR64, FPR64>; +def : FPALUDynFrmAlias<FADD_D, "fadd.d", FPR64>; +def : FPALUDynFrmAlias<FSUB_D, "fsub.d", FPR64>; +def : FPALUDynFrmAlias<FMUL_D, "fmul.d", FPR64>; +def : FPALUDynFrmAlias<FDIV_D, "fdiv.d", FPR64>; -def FSGNJ_D : FPALUD_rr<0b0010001, 0b000, "fsgnj.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FSGNJN_D : FPALUD_rr<0b0010001, 0b001, "fsgnjn.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FSGNJX_D : FPALUD_rr<0b0010001, 0b010, "fsgnjx.d">, - Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>; -def FMIN_D : 
FPALUD_rr<0b0010101, 0b000, "fmin.d">, - Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>; -def FMAX_D : FPALUD_rr<0b0010101, 0b001, "fmax.d">, - Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>; +def FSQRT_D : FPUnaryOp_r_frm<0b0101101, 0b00000, FPR64, FPR64, "fsqrt.d">, + Sched<[WriteFSqrt64, ReadFSqrt64]>; +def : FPUnaryOpDynFrmAlias<FSQRT_D, "fsqrt.d", FPR64, FPR64>; -def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, FPR32, FPR64, "fcvt.s.d">, - Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]> { - let rs2 = 0b00001; +let SchedRW = [WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64], + mayRaiseFPException = 0 in { +def FSGNJ_D : FPALU_rr<0b0010001, 0b000, "fsgnj.d", FPR64>; +def FSGNJN_D : FPALU_rr<0b0010001, 0b001, "fsgnjn.d", FPR64>; +def FSGNJX_D : FPALU_rr<0b0010001, 0b010, "fsgnjx.d", FPR64>; } -def : FPUnaryOpDynFrmAlias<FCVT_S_D, "fcvt.s.d", FPR32, FPR64>; -def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR32, "fcvt.d.s">, - Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax64, ReadFMinMax64, ReadFMinMax64] in { +def FMIN_D : FPALU_rr<0b0010101, 0b000, "fmin.d", FPR64>; +def FMAX_D : FPALU_rr<0b0010101, 0b001, "fmax.d", FPR64>; } -def FEQ_D : FPCmpD_rr<0b010, "feq.d">; -def FLT_D : FPCmpD_rr<0b001, "flt.d">; -def FLE_D : FPCmpD_rr<0b000, "fle.d">; +def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, 0b00001, FPR32, FPR64, "fcvt.s.d">, + Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]>; +def : FPUnaryOpDynFrmAlias<FCVT_S_D, "fcvt.s.d", FPR32, FPR64>; + +def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b00000, 0b000, FPR64, FPR32, "fcvt.d.s">, + Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]>; -def FCLASS_D : FPUnaryOp_r<0b1110001, 0b001, GPR, FPR64, "fclass.d">, - Sched<[WriteFClass64, ReadFClass64]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp64, ReadFCmp64, ReadFCmp64] in { +def FEQ_D : FPCmp_rr<0b1010001, 0b010, "feq.d", FPR64>; +def FLT_D : FPCmp_rr<0b1010001, 0b001, "flt.d", FPR64>; +def FLE_D : FPCmp_rr<0b1010001, 0b000, "fle.d", 
FPR64>; } -def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.w.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FCLASS_D : FPUnaryOp_r<0b1110001, 0b00000, 0b001, GPR, FPR64, "fclass.d">, + Sched<[WriteFClass64, ReadFClass64]>; + +def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, 0b00000, GPR, FPR64, "fcvt.w.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_D, "fcvt.w.d", GPR, FPR64>; -def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.wu.d">, - Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, 0b00001, GPR, FPR64, "fcvt.wu.d">, + Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_D, "fcvt.wu.d", GPR, FPR64>; -def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.w">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> { - let rs2 = 0b00000; -} +def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b00000, 0b000, FPR64, GPR, "fcvt.d.w">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; -def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.wu">, - Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> { - let rs2 = 0b00001; -} +def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b00001, 0b000, FPR64, GPR, "fcvt.d.wu">, + Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]>; } // Predicates = [HasStdExtD] let Predicates = [HasStdExtD, IsRV64] in { -def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.l.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, 0b00010, GPR, FPR64, "fcvt.l.d">, + Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_D, "fcvt.l.d", GPR, FPR64>; -def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.lu.d">, - Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, 0b00011, GPR, FPR64, "fcvt.lu.d">, + 
Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_D, "fcvt.lu.d", GPR, FPR64>; -def FMV_X_D : FPUnaryOp_r<0b1110001, 0b000, GPR, FPR64, "fmv.x.d">, - Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_X_D : FPUnaryOp_r<0b1110001, 0b00000, 0b000, GPR, FPR64, "fmv.x.d">, + Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]>; -def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.l">, - Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> { - let rs2 = 0b00010; -} +def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, 0b00010, FPR64, GPR, "fcvt.d.l">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; def : FPUnaryOpDynFrmAlias<FCVT_D_L, "fcvt.d.l", FPR64, GPR>; -def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.lu">, - Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> { - let rs2 = 0b00011; -} +def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, 0b00011, FPR64, GPR, "fcvt.d.lu">, + Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]>; def : FPUnaryOpDynFrmAlias<FCVT_D_LU, "fcvt.d.lu", FPR64, GPR>; -def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x">, - Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_D_X : FPUnaryOp_r<0b1111001, 0b00000, 0b000, FPR64, GPR, "fmv.d.x">, + Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]>; } // Predicates = [HasStdExtD, IsRV64] //===----------------------------------------------------------------------===// @@ -241,20 +184,20 @@ let Predicates = [HasStdExtD] in { /// Float conversion operations // f64 -> f32, f32 -> f64 -def : Pat<(fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>; -def : Pat<(fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>; +def : Pat<(any_fpround FPR64:$rs1), (FCVT_S_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR32:$rs1), (FCVT_D_S FPR32:$rs1)>; // [u]int<->double conversion patterns must be gated on IsRV32 or IsRV64, so // are defined later. 
/// Float arithmetic operations -def : PatFpr64Fpr64DynFrm<fadd, FADD_D>; -def : PatFpr64Fpr64DynFrm<fsub, FSUB_D>; -def : PatFpr64Fpr64DynFrm<fmul, FMUL_D>; -def : PatFpr64Fpr64DynFrm<fdiv, FDIV_D>; +def : PatFpr64Fpr64DynFrm<any_fadd, FADD_D>; +def : PatFpr64Fpr64DynFrm<any_fsub, FSUB_D>; +def : PatFpr64Fpr64DynFrm<any_fmul, FMUL_D>; +def : PatFpr64Fpr64DynFrm<any_fdiv, FDIV_D>; -def : Pat<(fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR64:$rs1), (FSQRT_D FPR64:$rs1, 0b111)>; def : Pat<(fneg FPR64:$rs1), (FSGNJN_D $rs1, $rs1)>; def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>; @@ -266,19 +209,19 @@ def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, 0b111))>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), +def : Pat<(any_fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), (FMADD_D $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)), +def : Pat<(any_fma FPR64:$rs1, FPR64:$rs2, (fneg FPR64:$rs3)), (FMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), +def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, FPR64:$rs3), (FNMSUB_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), +def : Pat<(any_fma (fneg FPR64:$rs1), FPR64:$rs2, (fneg FPR64:$rs3)), (FNMADD_D FPR64:$rs1, FPR64:$rs2, FPR64:$rs3, 0b111)>; // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches @@ -328,8 +271,8 @@ let Predicates = [HasStdExtD, IsRV32] in { def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>; // double->[u]int. Round-to-zero must be used. 
-def : Pat<(i32 (fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>; // Saturating double->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_W_D $rs1, 0b001)>; @@ -342,8 +285,8 @@ def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>; def : Pat<(i32 (lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>; // [u]int->double. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_D_WU GPR:$rs1)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_D_WU GPR:$rs1)>; } // Predicates = [HasStdExtD, IsRV32] let Predicates = [HasStdExtD, IsRV64] in { @@ -358,20 +301,20 @@ def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>; // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>; // [u]int32->fp -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>; // Saturating double->[u]int64. 
def : Pat<(i64 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_L_D $rs1, 0b001)>; def : Pat<(i64 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_LU_D $rs1, 0b001)>; // double->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>; // double->int64 with current rounding mode. def : Pat<(i64 (lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>; @@ -382,6 +325,6 @@ def : Pat<(i64 (lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; def : Pat<(i64 (llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>; // [u]int64->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_D_LU GPR:$rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_D_LU GPR:$rs1, 0b111)>; } // Predicates = [HasStdExtD, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 3400c3be52bf..bb45ed859442 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -19,9 +19,9 @@ def SDT_RISCVFMV_W_X_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i64>]>; def SDT_RISCVFMV_X_ANYEXTW_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; -def STD_RISCVFCVT_W_RV64 +def SDT_RISCVFCVT_W_RV64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>; -def STD_RISCVFCVT_X +def SDT_RISCVFCVT_X : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>; def riscv_fmv_w_x_rv64 @@ -29,13 +29,27 @@ def riscv_fmv_w_x_rv64 def riscv_fmv_x_anyextw_rv64 : SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>; def riscv_fcvt_w_rtz_rv64 - : 
SDNode<"RISCVISD::FCVT_W_RTZ_RV64", STD_RISCVFCVT_W_RV64>; + : SDNode<"RISCVISD::FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; def riscv_fcvt_wu_rtz_rv64 - : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", STD_RISCVFCVT_W_RV64>; + : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64>; def riscv_fcvt_x_rtz - : SDNode<"RISCVISD::FCVT_X_RTZ", STD_RISCVFCVT_X>; + : SDNode<"RISCVISD::FCVT_X_RTZ", SDT_RISCVFCVT_X>; def riscv_fcvt_xu_rtz - : SDNode<"RISCVISD::FCVT_XU_RTZ", STD_RISCVFCVT_X>; + : SDNode<"RISCVISD::FCVT_XU_RTZ", SDT_RISCVFCVT_X>; + +def riscv_strict_fcvt_w_rtz_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64, + [SDNPHasChain]>; +def riscv_strict_fcvt_wu_rtz_rv64 + : SDNode<"RISCVISD::STRICT_FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64, + [SDNPHasChain]>; + +def riscv_any_fcvt_w_rtz_rv64 : PatFrags<(ops node:$src), + [(riscv_strict_fcvt_w_rtz_rv64 node:$src), + (riscv_fcvt_w_rtz_rv64 node:$src)]>; +def riscv_any_fcvt_wu_rtz_rv64 : PatFrags<(ops node:$src), + [(riscv_strict_fcvt_wu_rtz_rv64 node:$src), + (riscv_fcvt_wu_rtz_rv64 node:$src)]>; //===----------------------------------------------------------------------===// // Operand and SDNode transformation definitions. 
@@ -59,54 +73,65 @@ def frmarg : Operand<XLenVT> { // Instruction class templates //===----------------------------------------------------------------------===// -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAS_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b00, opcode, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, hasPostISelHook = 1 in +class FPFMA_rrr_frm<RISCVOpcode opcode, bits<2> funct2, string opcodestr, + RegisterClass rty> + : RVInstR4Frm<funct2, opcode, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2, rty:$rs3, frmarg:$frm), + opcodestr, "$rd, $rs1, $rs2, $rs3, $frm">; -class FPFMASDynFrmAlias<FPFMAS_rrr_frm Inst, string OpcodeStr> +class FPFMADynFrmAlias<FPFMA_rrr_frm Inst, string OpcodeStr, + RegisterClass rty> : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR32:$rd, FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; + (Inst rty:$rd, rty:$rs1, rty:$rs2, rty:$rs3, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUS_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr, + RegisterClass rty> + : RVInstR<funct7, funct3, OPC_OP_FP, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUS_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR32:$rd), - (ins FPR32:$rs1, FPR32:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, 
hasPostISelHook = 1 in +class FPALU_rr_frm<bits<7> funct7, string opcodestr, RegisterClass rty> + : RVInstRFrm<funct7, OPC_OP_FP, (outs rty:$rd), + (ins rty:$rs1, rty:$rs2, frmarg:$frm), opcodestr, + "$rd, $rs1, $rs2, $frm">; -class FPALUSDynFrmAlias<FPALUS_rr_frm Inst, string OpcodeStr> +class FPALUDynFrmAlias<FPALU_rr_frm Inst, string OpcodeStr, + RegisterClass rty> : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR32:$rd, FPR32:$rs1, FPR32:$rs2, 0b111)>; + (Inst rty:$rd, rty:$rs1, rty:$rs2, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPUnaryOp_r<bits<7> funct7, bits<3> funct3, RegisterClass rdty, - RegisterClass rs1ty, string opcodestr> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPUnaryOp_r<bits<7> funct7, bits<5> rs2val, bits<3> funct3, + RegisterClass rdty, RegisterClass rs1ty, string opcodestr> : RVInstR<funct7, funct3, OPC_OP_FP, (outs rdty:$rd), (ins rs1ty:$rs1), - opcodestr, "$rd, $rs1">; + opcodestr, "$rd, $rs1"> { + let rs2 = rs2val; +} -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPUnaryOp_r_frm<bits<7> funct7, RegisterClass rdty, RegisterClass rs1ty, - string opcodestr> +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1, + UseNamedOperandTable = 1, hasPostISelHook = 1 in +class FPUnaryOp_r_frm<bits<7> funct7, bits<5> rs2val, RegisterClass rdty, + RegisterClass rs1ty, string opcodestr> : RVInstRFrm<funct7, OPC_OP_FP, (outs rdty:$rd), - (ins rs1ty:$rs1, frmarg:$funct3), opcodestr, - "$rd, $rs1, $funct3">; + (ins rs1ty:$rs1, frmarg:$frm), opcodestr, + "$rd, $rs1, $frm"> { + let rs2 = rs2val; +} class FPUnaryOpDynFrmAlias<FPUnaryOp_r_frm Inst, string OpcodeStr, RegisterClass rdty, RegisterClass rs1ty> : InstAlias<OpcodeStr#" $rd, $rs1", (Inst rdty:$rd, rs1ty:$rs1, 0b111)>; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpS_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010000, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins 
FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp32, ReadFCmp32, ReadFCmp32]>; +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1 in +class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr, + RegisterClass rty> + : RVInstR<funct7, funct3, OPC_OP_FP, (outs GPR:$rd), + (ins rty:$rs1, rty:$rs2), opcodestr, "$rd, $rs1, $rs2">; //===----------------------------------------------------------------------===// // Instructions @@ -128,116 +153,98 @@ def FSW : RVInstS<0b010, OPC_STORE_FP, (outs), "fsw", "$rs2, ${imm12}(${rs1})">, Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>; -def FMADD_S : FPFMAS_rrr_frm<OPC_MADD, "fmadd.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FMADD_S, "fmadd.s">; -def FMSUB_S : FPFMAS_rrr_frm<OPC_MSUB, "fmsub.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FMSUB_S, "fmsub.s">; -def FNMSUB_S : FPFMAS_rrr_frm<OPC_NMSUB, "fnmsub.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FNMSUB_S, "fnmsub.s">; -def FNMADD_S : FPFMAS_rrr_frm<OPC_NMADD, "fnmadd.s">, - Sched<[WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32]>; -def : FPFMASDynFrmAlias<FNMADD_S, "fnmadd.s">; +let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { +def FMADD_S : FPFMA_rrr_frm<OPC_MADD, 0b00, "fmadd.s", FPR32>; +def FMSUB_S : FPFMA_rrr_frm<OPC_MSUB, 0b00, "fmsub.s", FPR32>; +def FNMSUB_S : FPFMA_rrr_frm<OPC_NMSUB, 0b00, "fnmsub.s", FPR32>; +def FNMADD_S : FPFMA_rrr_frm<OPC_NMADD, 0b00, "fnmadd.s", FPR32>; +} + +def : FPFMADynFrmAlias<FMADD_S, "fmadd.s", FPR32>; +def : FPFMADynFrmAlias<FMSUB_S, "fmsub.s", FPR32>; +def : FPFMADynFrmAlias<FNMSUB_S, "fnmsub.s", FPR32>; +def : FPFMADynFrmAlias<FNMADD_S, "fnmadd.s", FPR32>; -def FADD_S : FPALUS_rr_frm<0b0000000, "fadd.s">, +def FADD_S : FPALU_rr_frm<0b0000000, "fadd.s", FPR32>, Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def : FPALUSDynFrmAlias<FADD_S, 
"fadd.s">; -def FSUB_S : FPALUS_rr_frm<0b0000100, "fsub.s">, +def FSUB_S : FPALU_rr_frm<0b0000100, "fsub.s", FPR32>, Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>; -def : FPALUSDynFrmAlias<FSUB_S, "fsub.s">; -def FMUL_S : FPALUS_rr_frm<0b0001000, "fmul.s">, +def FMUL_S : FPALU_rr_frm<0b0001000, "fmul.s", FPR32>, Sched<[WriteFMul32, ReadFMul32, ReadFMul32]>; -def : FPALUSDynFrmAlias<FMUL_S, "fmul.s">; -def FDIV_S : FPALUS_rr_frm<0b0001100, "fdiv.s">, +def FDIV_S : FPALU_rr_frm<0b0001100, "fdiv.s", FPR32>, Sched<[WriteFDiv32, ReadFDiv32, ReadFDiv32]>; -def : FPALUSDynFrmAlias<FDIV_S, "fdiv.s">; -def FSQRT_S : FPUnaryOp_r_frm<0b0101100, FPR32, FPR32, "fsqrt.s">, - Sched<[WriteFSqrt32, ReadFSqrt32]> { - let rs2 = 0b00000; -} +def : FPALUDynFrmAlias<FADD_S, "fadd.s", FPR32>; +def : FPALUDynFrmAlias<FSUB_S, "fsub.s", FPR32>; +def : FPALUDynFrmAlias<FMUL_S, "fmul.s", FPR32>; +def : FPALUDynFrmAlias<FDIV_S, "fdiv.s", FPR32>; + +def FSQRT_S : FPUnaryOp_r_frm<0b0101100, 0b00000, FPR32, FPR32, "fsqrt.s">, + Sched<[WriteFSqrt32, ReadFSqrt32]>; def : FPUnaryOpDynFrmAlias<FSQRT_S, "fsqrt.s", FPR32, FPR32>; -def FSGNJ_S : FPALUS_rr<0b0010000, 0b000, "fsgnj.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FSGNJN_S : FPALUS_rr<0b0010000, 0b001, "fsgnjn.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FSGNJX_S : FPALUS_rr<0b0010000, 0b010, "fsgnjx.s">, - Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>; -def FMIN_S : FPALUS_rr<0b0010100, 0b000, "fmin.s">, - Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>; -def FMAX_S : FPALUS_rr<0b0010100, 0b001, "fmax.s">, - Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>; +let SchedRW = [WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32], + mayRaiseFPException = 0 in { +def FSGNJ_S : FPALU_rr<0b0010000, 0b000, "fsgnj.s", FPR32>; +def FSGNJN_S : FPALU_rr<0b0010000, 0b001, "fsgnjn.s", FPR32>; +def FSGNJX_S : FPALU_rr<0b0010000, 0b010, "fsgnjx.s", FPR32>; +} -def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.w.s">, - 
Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax32, ReadFMinMax32, ReadFMinMax32] in { +def FMIN_S : FPALU_rr<0b0010100, 0b000, "fmin.s", FPR32>; +def FMAX_S : FPALU_rr<0b0010100, 0b001, "fmax.s", FPR32>; } + +def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, 0b00000, GPR, FPR32, "fcvt.w.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_S, "fcvt.w.s", GPR, FPR32>; -def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.wu.s">, - Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, 0b00001, GPR, FPR32, "fcvt.wu.s">, + Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_S, "fcvt.wu.s", GPR, FPR32>; -def FMV_X_W : FPUnaryOp_r<0b1110000, 0b000, GPR, FPR32, "fmv.x.w">, - Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]> { - let rs2 = 0b00000; -} - -def FEQ_S : FPCmpS_rr<0b010, "feq.s">; -def FLT_S : FPCmpS_rr<0b001, "flt.s">; -def FLE_S : FPCmpS_rr<0b000, "fle.s">; +let mayRaiseFPException = 0 in +def FMV_X_W : FPUnaryOp_r<0b1110000, 0b00000, 0b000, GPR, FPR32, "fmv.x.w">, + Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]>; -def FCLASS_S : FPUnaryOp_r<0b1110000, 0b001, GPR, FPR32, "fclass.s">, - Sched<[WriteFClass32, ReadFClass32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp32, ReadFCmp32, ReadFCmp32] in { +def FEQ_S : FPCmp_rr<0b1010000, 0b010, "feq.s", FPR32>; +def FLT_S : FPCmp_rr<0b1010000, 0b001, "flt.s", FPR32>; +def FLE_S : FPCmp_rr<0b1010000, 0b000, "fle.s", FPR32>; } -def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.w">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FCLASS_S : FPUnaryOp_r<0b1110000, 0b00000, 0b001, GPR, FPR32, "fclass.s">, + Sched<[WriteFClass32, ReadFClass32]>; + +def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, 0b00000, FPR32, GPR, "fcvt.s.w">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; def : 
FPUnaryOpDynFrmAlias<FCVT_S_W, "fcvt.s.w", FPR32, GPR>; -def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.wu">, - Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> { - let rs2 = 0b00001; -} +def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, 0b00001, FPR32, GPR, "fcvt.s.wu">, + Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_WU, "fcvt.s.wu", FPR32, GPR>; -def FMV_W_X : FPUnaryOp_r<0b1111000, 0b000, FPR32, GPR, "fmv.w.x">, - Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_W_X : FPUnaryOp_r<0b1111000, 0b00000, 0b000, FPR32, GPR, "fmv.w.x">, + Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]>; } // Predicates = [HasStdExtF] let Predicates = [HasStdExtF, IsRV64] in { -def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.l.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, 0b00010, GPR, FPR32, "fcvt.l.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_S, "fcvt.l.s", GPR, FPR32>; -def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.lu.s">, - Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, 0b00011, GPR, FPR32, "fcvt.lu.s">, + Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_S, "fcvt.lu.s", GPR, FPR32>; -def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.l">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> { - let rs2 = 0b00010; -} +def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, 0b00010, FPR32, GPR, "fcvt.s.l">, + Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_L, "fcvt.s.l", FPR32, GPR>; -def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.lu">, - Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> { - let rs2 = 0b00011; -} +def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, 0b00011, FPR32, GPR, "fcvt.s.lu">, + 
Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]>; def : FPUnaryOpDynFrmAlias<FCVT_S_LU, "fcvt.s.lu", FPR32, GPR>; } // Predicates = [HasStdExtF, IsRV64] @@ -320,12 +327,12 @@ def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>; /// Float arithmetic operations -def : PatFpr32Fpr32DynFrm<fadd, FADD_S>; -def : PatFpr32Fpr32DynFrm<fsub, FSUB_S>; -def : PatFpr32Fpr32DynFrm<fmul, FMUL_S>; -def : PatFpr32Fpr32DynFrm<fdiv, FDIV_S>; +def : PatFpr32Fpr32DynFrm<any_fadd, FADD_S>; +def : PatFpr32Fpr32DynFrm<any_fsub, FSUB_S>; +def : PatFpr32Fpr32DynFrm<any_fmul, FMUL_S>; +def : PatFpr32Fpr32DynFrm<any_fdiv, FDIV_S>; -def : Pat<(fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR32:$rs1), (FSQRT_S FPR32:$rs1, 0b111)>; def : Pat<(fneg FPR32:$rs1), (FSGNJN_S $rs1, $rs1)>; def : Pat<(fabs FPR32:$rs1), (FSGNJX_S $rs1, $rs1)>; @@ -334,19 +341,19 @@ def : PatFpr32Fpr32<fcopysign, FSGNJ_S>; def : Pat<(fcopysign FPR32:$rs1, (fneg FPR32:$rs2)), (FSGNJN_S $rs1, $rs2)>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), +def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, FPR32:$rs3), (FMADD_S $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)), +def : Pat<(any_fma FPR32:$rs1, FPR32:$rs2, (fneg FPR32:$rs3)), (FMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), +def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, FPR32:$rs3), (FNMSUB_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), +def : Pat<(any_fma (fneg FPR32:$rs1), FPR32:$rs2, (fneg FPR32:$rs3)), (FNMADD_S FPR32:$rs1, FPR32:$rs2, FPR32:$rs3, 0b111)>; // The ratified 20191213 ISA spec defines fmin and fmax in a way that matches @@ -382,8 +389,8 @@ def : Pat<(bitconvert (i32 GPR:$rs1)), (FMV_W_X GPR:$rs1)>; def : Pat<(i32 (bitconvert FPR32:$rs1)), (FMV_X_W FPR32:$rs1)>; // 
float->[u]int. Round-to-zero must be used. -def : Pat<(i32 (fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>; // Saturating float->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>; @@ -396,8 +403,8 @@ def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>; def : Pat<(i32 (lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>; // [u]int->float. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_S_WU $rs1, 0b111)>; } // Predicates = [HasStdExtF, IsRV32] let Predicates = [HasStdExtF, IsRV64] in { @@ -410,12 +417,12 @@ def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32), // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. -def : Pat<(riscv_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>; // float->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>; // Saturating float->[u]int64. 
def : Pat<(i64 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>; @@ -430,8 +437,8 @@ def : Pat<(i64 (lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>; def : Pat<(i64 (llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>; // [u]int->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_S_WU $rs1, 0b111)>; -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_S_L $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_S_LU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_S_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_S_L $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_S_LU $rs1, 0b111)>; } // Predicates = [HasStdExtF, IsRV64] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index a037dbf585ce..b62e23d3b0fa 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -96,14 +96,6 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtM, IsRV64] -// Pattern to detect constants with no more than 32 active bits that can't -// be materialized with lui+addiw. -def uimm32_not_simm32 : PatLeaf<(XLenVT GPR:$a), [{ - auto *C = dyn_cast<ConstantSDNode>(N); - return C && C->hasOneUse() && isUInt<32>(C->getZExtValue()) && - !isInt<32>(C->getSExtValue()); -}]>; - let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // Special case for calculating the full 64-bit product of a 32x32 unsigned // multiply where the inputs aren't known to be zero extended. We can shift the @@ -111,9 +103,4 @@ let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // zeroing the upper 32 bits. 
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; -// The RHS could also be a constant that is hard to materialize. By shifting -// left we can allow constant materialization to use LUI+ADDIW via -// hasAllWUsers. -def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), uimm32_not_simm32:$rs2)), - (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; } // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td index 3d5f9bc54731..173ae43a08d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td @@ -338,29 +338,6 @@ class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr> opcodestr, "$vd, $vs2$vm">; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 0, mayLoad = 1, mayStore = 1 in { -// vamo vd, (rs1), vs2, vd, vm -class VAMOWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> - : RVInstVAMO<amoop, width.Value{2-0}, (outs VR:$vd_wd), - (ins GPR:$rs1, VR:$vs2, VR:$vd, VMaskOp:$vm), - opcodestr, "$vd_wd, (${rs1}), $vs2, $vd$vm"> { - let Constraints = "$vd_wd = $vd"; - let wd = 1; - bits<5> vd; - let Inst{11-7} = vd; -} - -// vamo x0, (rs1), vs2, vs3, vm -class VAMONoWd<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> - : RVInstVAMO<amoop, width.Value{2-0}, (outs), - (ins GPR:$rs1, VR:$vs2, VR:$vs3, VMaskOp:$vm), - opcodestr, "x0, (${rs1}), $vs2, $vs3$vm"> { - bits<5> vs3; - let Inst{11-7} = vs3; -} - -} // hasSideEffects = 0, mayLoad = 1, mayStore = 1 - //===----------------------------------------------------------------------===// // Combination of instruction classes. // Use these multiclasses to define instructions more easily. 
@@ -779,11 +756,6 @@ multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> { Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; } -multiclass VAMO<RISCVAMOOP amoop, RISCVWidth width, string opcodestr> { - def _WD : VAMOWd<amoop, width, opcodestr>; - def _UNWD : VAMONoWd<amoop, width, opcodestr>; -} - multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> { foreach l = [8, 16, 32, 64] in { defvar w = !cast<RISCVWidth>("LSWidth" # l); @@ -822,7 +794,7 @@ foreach eew = [8, 16, 32, 64] in { // Vector Strided Instructions def VLSE#eew#_V : VStridedLoad<w, "vlse"#eew#".v">, VLSSched<eew>; def VSSE#eew#_V : VStridedStore<w, "vsse"#eew#".v">, VSSSched<eew>; - + // Vector Indexed Instructions def VLUXEI#eew#_V : VIndexedLoad<MOPLDIndexedUnord, w, "vluxei"#eew#".v">, VLXSched<eew, "U">; @@ -1416,13 +1388,20 @@ defm VCOMPRESS_V : VCPR_MV_Mask<"vcompress", 0b010111>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, RVVConstraint = NoConstraint in { -foreach n = [1, 2, 4, 8] in { - def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs VR:$vd), - (ins VR:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, - VMVRSched<n> { +def VMV1R_V : RVInstV<0b100111, 0, OPIVI, (outs VR:$vd), (ins VR:$vs2), + "vmv1r.v", "$vd, $vs2">, VMVRSched<1> { let Uses = []; let vm = 1; } +// A future extension may relax the vector register alignment restrictions. 
+foreach n = [2, 4, 8] in { + defvar vrc = !cast<VReg>("VRM"#n); + def VMV#n#R_V : RVInstV<0b100111, !add(n, -1), OPIVI, (outs vrc:$vd), + (ins vrc:$vs2), "vmv" # n # "r.v", "$vd, $vs2">, + VMVRSched<n> { + let Uses = []; + let vm = 1; + } } } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 } // Predicates = [HasStdExtV] @@ -1462,31 +1441,4 @@ let Predicates = [HasStdExtZvlsseg] in { } } // Predicates = [HasStdExtZvlsseg] -let Predicates = [HasStdExtZvamo, HasStdExtA] in { - foreach eew = [8, 16, 32] in { - defvar w = !cast<RISCVWidth>("LSWidth"#eew); - defm VAMOSWAPEI#eew : VAMO<AMOOPVamoSwap, w, "vamoswapei"#eew#".v">; - defm VAMOADDEI#eew : VAMO<AMOOPVamoAdd, w, "vamoaddei"#eew#".v">; - defm VAMOXOREI#eew : VAMO<AMOOPVamoXor, w, "vamoxorei"#eew#".v">; - defm VAMOANDEI#eew : VAMO<AMOOPVamoAnd, w, "vamoandei"#eew#".v">; - defm VAMOOREI#eew : VAMO<AMOOPVamoOr, w, "vamoorei"#eew#".v">; - defm VAMOMINEI#eew : VAMO<AMOOPVamoMin, w, "vamominei"#eew#".v">; - defm VAMOMAXEI#eew : VAMO<AMOOPVamoMax, w, "vamomaxei"#eew#".v">; - defm VAMOMINUEI#eew : VAMO<AMOOPVamoMinu, w, "vamominuei"#eew#".v">; - defm VAMOMAXUEI#eew : VAMO<AMOOPVamoMaxu, w, "vamomaxuei"#eew#".v">; - } -} // Predicates = [HasStdExtZvamo, HasStdExtA] - -let Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] in { - defm VAMOSWAPEI64 : VAMO<AMOOPVamoSwap, LSWidth64, "vamoswapei64.v">; - defm VAMOADDEI64 : VAMO<AMOOPVamoAdd, LSWidth64, "vamoaddei64.v">; - defm VAMOXOREI64 : VAMO<AMOOPVamoXor, LSWidth64, "vamoxorei64.v">; - defm VAMOANDEI64 : VAMO<AMOOPVamoAnd, LSWidth64, "vamoandei64.v">; - defm VAMOOREI64 : VAMO<AMOOPVamoOr, LSWidth64, "vamoorei64.v">; - defm VAMOMINEI64 : VAMO<AMOOPVamoMin, LSWidth64, "vamominei64.v">; - defm VAMOMAXEI64 : VAMO<AMOOPVamoMax, LSWidth64, "vamomaxei64.v">; - defm VAMOMINUEI64 : VAMO<AMOOPVamoMinu, LSWidth64, "vamominuei64.v">; - defm VAMOMAXUEI64 : VAMO<AMOOPVamoMaxu, LSWidth64, "vamomaxuei64.v">; -} // Predicates = [HasStdExtZvamo, HasStdExtA, IsRV64] - include 
"RISCVInstrInfoVPseudos.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index a82e333e6bab..073fa605e0fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1124,68 +1124,6 @@ class VPseudoTernaryNoMaskWithPolicy<VReg RetClass, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } -class VPseudoAMOWDNoMask<VReg RetClass, - VReg Op1Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd), - (ins GPR:$rs1, - Op1Class:$vs2, - GetVRegNoV0<RetClass>.R:$vd, - AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; - let Constraints = "$vd_wd = $vd"; - let HasVLOp = 1; - let HasSEWOp = 1; - let HasDummyMask = 1; - let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); -} - -class VPseudoAMOWDMask<VReg RetClass, - VReg Op1Class> : - Pseudo<(outs GetVRegNoV0<RetClass>.R:$vd_wd), - (ins GPR:$rs1, - Op1Class:$vs2, - GetVRegNoV0<RetClass>.R:$vd, - VMaskOp:$vm, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 1; - let mayStore = 1; - let hasSideEffects = 1; - let Constraints = "$vd_wd = $vd"; - let HasVLOp = 1; - let HasSEWOp = 1; - let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); -} - -multiclass VPseudoAMOEI<int eew> { - // Standard scalar AMO supports 32, 64, and 128 Mem data bits, - // and in the base vector "V" extension, only SEW up to ELEN = max(XLEN, FLEN) - // are required to be supported. - // therefore only [32, 64] is allowed here. 
- foreach sew = [32, 64] in { - foreach lmul = MxSet<sew>.m in { - defvar octuple_lmul = lmul.octuple; - // Calculate emul = eew * lmul / sew - defvar octuple_emul = !srl(!mul(eew, octuple_lmul), log2<sew>.val); - if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then { - defvar emulMX = octuple_to_str<octuple_emul>.ret; - defvar emul= !cast<LMULInfo>("V_" # emulMX); - let VLMul = lmul.value in { - def "_WD_" # lmul.MX # "_" # emulMX : VPseudoAMOWDNoMask<lmul.vrclass, emul.vrclass>; - def "_WD_" # lmul.MX # "_" # emulMX # "_MASK" : VPseudoAMOWDMask<lmul.vrclass, emul.vrclass>; - } - } - } - } -} - -multiclass VPseudoAMO { - foreach eew = EEWList in - defm "EI" # eew : VPseudoAMOEI<eew>; -} - class VPseudoUSSegLoadNoMask<VReg RetClass, int EEW, bits<4> NF, bit isFF>: Pseudo<(outs RetClass:$rd), (ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>, @@ -1376,17 +1314,35 @@ class VPseudoISegStoreMask<VReg ValClass, VReg IdxClass, int EEW, bits<3> LMUL, let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst); } -multiclass VPseudoUSLoad<bit isFF> { +multiclass VPseudoUSLoad { foreach eew = EEWList in { foreach lmul = MxSet<eew>.m in { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; - defvar FFStr = !if(isFF, "FF", ""); let VLMul = lmul.value in { - def "E" # eew # FFStr # "_V_" # LInfo : - VPseudoUSLoadNoMask<vreg, eew, isFF>; - def "E" # eew # FFStr # "_V_" # LInfo # "_MASK" : - VPseudoUSLoadMask<vreg, eew, isFF>; + def "E" # eew # "_V_" # LInfo : + VPseudoUSLoadNoMask<vreg, eew, false>, + VLESched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : + VPseudoUSLoadMask<vreg, eew, false>, + VLESched<eew>; + } + } + } +} + +multiclass VPseudoFFLoad { + foreach eew = EEWList in { + foreach lmul = MxSet<eew>.m in { + defvar LInfo = lmul.MX; + defvar vreg = lmul.vrclass; + let VLMul = lmul.value in { + def "E" # eew # "FF_V_" # LInfo : + VPseudoUSLoadNoMask<vreg, eew, true>, + VLFSched<eew>; + def "E" # eew # "FF_V_" # LInfo # "_MASK" : + VPseudoUSLoadMask<vreg, eew, 
true>, + VLFSched<eew>; } } } @@ -1406,8 +1362,10 @@ multiclass VPseudoSLoad { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>, + VLSSched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>, + VLSSched<eew>; } } } @@ -1427,11 +1385,14 @@ multiclass VPseudoILoad<bit Ordered> { defvar Vreg = lmul.vrclass; defvar IdxVreg = idx_lmul.vrclass; defvar HasConstraint = !ne(sew, eew); + defvar Order = !if(Ordered, "O", "U"); let VLMul = lmul.value in { def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo : - VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>; + VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, + VLXSched<eew, Order>; def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : - VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>; + VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>, + VLXSched<eew, Order>; } } } @@ -1445,8 +1406,10 @@ multiclass VPseudoUSStore { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoUSStoreNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoUSStoreNoMask<vreg, eew>, + VSESched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoUSStoreMask<vreg, eew>, + VSESched<eew>; } } } @@ -1466,8 +1429,10 @@ multiclass VPseudoSStore { defvar LInfo = lmul.MX; defvar vreg = lmul.vrclass; let VLMul = lmul.value in { - def "E" # eew # "_V_" # LInfo : VPseudoSStoreNoMask<vreg, eew>; - def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg, eew>; + def "E" # eew # "_V_" # LInfo : VPseudoSStoreNoMask<vreg, eew>, + 
VSSSched<eew>; + def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSStoreMask<vreg, eew>, + VSSSched<eew>; } } } @@ -1486,11 +1451,14 @@ multiclass VPseudoIStore<bit Ordered> { defvar idx_lmul = !cast<LMULInfo>("V_" # IdxLInfo); defvar Vreg = lmul.vrclass; defvar IdxVreg = idx_lmul.vrclass; + defvar Order = !if(Ordered, "O", "U"); let VLMul = lmul.value in { def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo : - VPseudoIStoreNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>; + VPseudoIStoreNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>, + VSXSched<eew, Order>; def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" : - VPseudoIStoreMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>; + VPseudoIStoreMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered>, + VSXSched<eew, Order>; } } } @@ -1498,32 +1466,50 @@ multiclass VPseudoIStore<bit Ordered> { } } -multiclass VPseudoUnaryS_M { +multiclass VPseudoVPOP_M { foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>; - def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask; + def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>, + Sched<[WriteVMPopV, ReadVMPopV, ReadVMPopV]>; + def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask, + Sched<[WriteVMPopV, ReadVMPopV, ReadVMPopV]>; } } } -multiclass VPseudoUnaryM_M { +multiclass VPseudoV1ST_M { + foreach mti = AllMasks in + { + let VLMul = mti.LMul.value in { + def "_M_" # mti.BX : VPseudoUnaryNoMask<GPR, VR>, + Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMFFSV]>; + def "_M_" # mti.BX # "_MASK" : VPseudoMaskUnarySOutMask, + Sched<[WriteVMFFSV, ReadVMFFSV, ReadVMFFSV]>; + } + } +} + +multiclass VPseudoVSFS_M { defvar constraint = "@earlyclobber $rd"; foreach mti = AllMasks in { let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>; - def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>; + def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>, + Sched<[WriteVMSFSV, 
ReadVMSFSV, ReadVMask]>; + def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>, + Sched<[WriteVMSFSV, ReadVMSFSV, ReadVMask]>; } } } -multiclass VPseudoMaskNullaryV { +multiclass VPseudoVID_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask<m.vrclass>; + def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>, + Sched<[WriteVMIdxV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoNullaryMask<m.vrclass>, + Sched<[WriteVMIdxV, ReadVMask]>; } } } @@ -1536,20 +1522,23 @@ multiclass VPseudoNullaryPseudoM <string BaseInst> { } } -multiclass VPseudoUnaryV_M { +multiclass VPseudoVIOT_M { defvar constraint = "@earlyclobber $rd"; foreach m = MxList.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; + def "_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, VR, constraint>, + Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>; } } } -multiclass VPseudoUnaryV_V_AnyMask { +multiclass VPseudoVCPR_V { foreach m = MxList.m in { let VLMul = m.value in - def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>; + def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>, + Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>; } } @@ -1611,7 +1600,7 @@ multiclass VPseudoBinaryV_VV<string Constraint = ""> { defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>; } -multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> { +multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> { foreach m = MxList.m in { foreach sew = EEWList in { defvar octuple_lmul = m.octuple; @@ -1620,7 +1609,8 @@ multiclass VPseudoBinaryV_VV_EEW<int eew, string Constraint = ""> { if !and(!ge(octuple_emul, 1), 
!le(octuple_emul, 64)) then { defvar emulMX = octuple_to_str<octuple_emul>.ret; defvar emul = !cast<LMULInfo>("V_" # emulMX); - defm _VV : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul, Constraint>; + defm _VV : VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul, Constraint>, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV]>; } } } @@ -1631,6 +1621,12 @@ multiclass VPseudoBinaryV_VX<string Constraint = ""> { defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>; } +multiclass VPseudoVSLD1_VX<string Constraint = ""> { + foreach m = MxList.m in + defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>, + Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>; +} + multiclass VPseudoBinaryV_VF<string Constraint = ""> { foreach m = MxList.m in foreach f = FPList.fpinfo in @@ -1638,15 +1634,24 @@ multiclass VPseudoBinaryV_VF<string Constraint = ""> { f.fprclass, m, Constraint>; } +multiclass VPseudoVSLD1_VF<string Constraint = ""> { + foreach m = MxList.m in + foreach f = FPList.fpinfo in + defm "_V" # f.FX : + VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>, + Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>; +} + multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, string Constraint = ""> { foreach m = MxList.m in defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } -multiclass VPseudoBinaryM_MM { +multiclass VPseudoVALU_MM { foreach m = MxList.m in let VLMul = m.value in { - def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">; + def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; } } @@ -1744,12 +1749,13 @@ multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, GPR, m, CarryIn, Constraint>; } -multiclass VPseudoBinaryV_FM { +multiclass VPseudoVMRG_FM { foreach m = MxList.m in foreach f = FPList.fpinfo in def "_V" # f.FX # "M_" # m.MX : 
VPseudoBinaryCarryIn<GetVRegNoV0<m.vrclass>.R, - m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">; + m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">, + Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>; } multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, @@ -1762,76 +1768,102 @@ multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1, m.vrclass, simm5, m, CarryIn, Constraint>; } -multiclass VPseudoUnaryV_V_X_I_NoDummyMask { +multiclass VPseudoUnaryVMV_V_X_I { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>; - def "_X_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, GPR>; - def "_I_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, simm5>; + def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>, + Sched<[WriteVIMovV, ReadVIMovV]>; + def "_X_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, GPR>, + Sched<[WriteVIMovX, ReadVIMovX]>; + def "_I_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, simm5>, + Sched<[WriteVIMovI]>; } } } -multiclass VPseudoUnaryV_F_NoDummyMask { +multiclass VPseudoVMV_F { foreach m = MxList.m in { foreach f = FPList.fpinfo in { let VLMul = m.value in { - def "_" # f.FX # "_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>; + def "_" # f.FX # "_" # m.MX : + VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>, + Sched<[WriteVFMovV, ReadVFMovF]>; } } } } -multiclass VPseudoUnaryTAV_V { +multiclass VPseudoVCLS_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>; + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>, + Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>; } } } -multiclass VPseudoUnaryV_V { +multiclass VPseudoVSQR_V { foreach m = MxList.m in { let VLMul = m.value in { - def "_V_" # 
m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>; - def "_V_" # m.MX # "_MASK" : VPseudoUnaryMask<m.vrclass, m.vrclass>; + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>, + Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF2 { +multiclass VPseudoVRCP_V { + foreach m = MxList.m in { + let VLMul = m.value in { + def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; + def "_V_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.vrclass>, + Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>; + } + } +} + +multiclass PseudoVEXT_VF2 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF2.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f2vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f2vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF4 { +multiclass PseudoVEXT_VF4 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF4.m in { let VLMul = m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f4vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f4vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } -multiclass PseudoUnaryV_VF8 { +multiclass PseudoVEXT_VF8 { defvar constraints = "@earlyclobber $rd"; foreach m = MxListVF8.m in { let VLMul = 
m.value in { - def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>; - def "_" # m.MX # "_MASK" : VPseudoUnaryMaskTA<m.vrclass, m.f8vrclass, - constraints>; + def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; + def "_" # m.MX # "_MASK" : + VPseudoUnaryMaskTA<m.vrclass, m.f8vrclass, constraints>, + Sched<[WriteVExtV, ReadVExtV, ReadVMask]>; } } } @@ -1874,30 +1906,172 @@ multiclass VPseudoBinaryM_VI { !if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>; } -multiclass VPseudoBinaryV_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { - defm "" : VPseudoBinaryV_VV<Constraint>; - defm "" : VPseudoBinaryV_VX<Constraint>; - defm "" : VPseudoBinaryV_VI<ImmType, Constraint>; +multiclass VPseudoVGTR_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVGatherV, ReadVGatherV, ReadVGatherV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVGatherX, ReadVGatherV, ReadVGatherX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVGatherI, ReadVGatherV, ReadVMask]>; } -multiclass VPseudoBinaryV_VV_VX { - defm "" : VPseudoBinaryV_VV; - defm "" : VPseudoBinaryV_VX; +multiclass VPseudoVSALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVSALUI, ReadVSALUV, ReadVMask]>; } -multiclass VPseudoBinaryV_VV_VF { - defm "" : VPseudoBinaryV_VV; - defm "" : VPseudoBinaryV_VF; + +multiclass VPseudoVSHT_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVShiftV, ReadVShiftV, ReadVShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + 
Sched<[WriteVShiftX, ReadVShiftV, ReadVShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVShiftI, ReadVShiftV, ReadVMask]>; } -multiclass VPseudoBinaryV_VX_VI<Operand ImmType = simm5> { - defm "" : VPseudoBinaryV_VX; - defm "" : VPseudoBinaryV_VI<ImmType>; +multiclass VPseudoVSSHT_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVSShiftV, ReadVSShiftV, ReadVSShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVSShiftX, ReadVSShiftV, ReadVSShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVSShiftI, ReadVSShiftV, ReadVMask]>; } -multiclass VPseudoBinaryW_VV_VX { - defm "" : VPseudoBinaryW_VV; - defm "" : VPseudoBinaryW_VX; +multiclass VPseudoVALU_VV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoBinaryV_VV<Constraint>, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX<Constraint>, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType, Constraint>, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; +} + +multiclass VPseudoVSALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVSALUV, ReadVSALUV, ReadVSALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVSALUX, ReadVSALUV, ReadVSALUX, ReadVMask]>; +} + +multiclass VPseudoVSMUL_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVSMulV, ReadVSMulV, ReadVSMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVSMulX, ReadVSMulV, ReadVSMulX, ReadVMask]>; +} + +multiclass VPseudoVAALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVAALUV, ReadVAALUV, ReadVAALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVAALUX, ReadVAALUV, ReadVAALUX, ReadVMask]>; +} + +multiclass VPseudoVMINMAX_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, 
+ Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; +} + +multiclass VPseudoVMUL_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIMulV, ReadVIMulV, ReadVIMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIMulX, ReadVIMulV, ReadVIMulX, ReadVMask]>; +} + +multiclass VPseudoVDIV_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIDivV, ReadVIDivV, ReadVIDivV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIDivX, ReadVIDivV, ReadVIDivX, ReadVMask]>; +} + +multiclass VPseudoVFMUL_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>; +} + +multiclass VPseudoVFDIV_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VPseudoVFRDIV_VF { + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>; +} + +multiclass VPseudoVALU_VV_VX { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; +} + +multiclass VPseudoVSGNJ_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>; +} + +multiclass VPseudoVMAX_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VPseudoVALU_VV_VF { + defm "" : VPseudoBinaryV_VV, + Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>; + defm "" : VPseudoBinaryV_VF, + Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; +} + +multiclass VPseudoVALU_VF { + defm "" : VPseudoBinaryV_VF, + 
Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>; +} + +multiclass VPseudoVALU_VX_VI<Operand ImmType = simm5> { + defm "" : VPseudoBinaryV_VX, + Sched<[WriteVIALUX, ReadVIALUV, ReadVIALUX, ReadVMask]>; + defm "" : VPseudoBinaryV_VI<ImmType>, + Sched<[WriteVIALUI, ReadVIALUV, ReadVMask]>; +} + +multiclass VPseudoVWALU_VV_VX { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_VX, + Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>; +} + +multiclass VPseudoVWMUL_VV_VX { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVIWMulV, ReadVIWMulV, ReadVIWMulV, ReadVMask]>; + defm "" : VPseudoBinaryW_VX, + Sched<[WriteVIWMulX, ReadVIWMulV, ReadVIWMulX, ReadVMask]>; +} + +multiclass VPseudoVWMUL_VV_VF { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>; + defm "" : VPseudoBinaryW_VF, + Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>; } multiclass VPseudoBinaryW_VV_VF { @@ -1905,53 +2079,100 @@ multiclass VPseudoBinaryW_VV_VF { defm "" : VPseudoBinaryW_VF; } -multiclass VPseudoBinaryW_WV_WX { - defm "" : VPseudoBinaryW_WV; - defm "" : VPseudoBinaryW_WX; +multiclass VPseudoVWALU_WV_WX { + defm "" : VPseudoBinaryW_WV, + Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_WX, + Sched<[WriteVIWALUX, ReadVIWALUV, ReadVIWALUX, ReadVMask]>; +} + +multiclass VPseudoVFWALU_VV_VF { + defm "" : VPseudoBinaryW_VV, + Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_VF, + Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; +} + +multiclass VPseudoVFWALU_WV_WF { + defm "" : VPseudoBinaryW_WV, + Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>; + defm "" : VPseudoBinaryW_WF, + Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>; +} + +multiclass VPseudoVMRG_VM_XM_IM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>; + 
defm "" : VPseudoBinaryV_XM, + Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM, + Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>; } -multiclass VPseudoBinaryW_WV_WF { - defm "" : VPseudoBinaryW_WV; - defm "" : VPseudoBinaryW_WF; +multiclass VPseudoVCALU_VM_XM_IM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } -multiclass VPseudoBinaryV_VM_XM_IM { - defm "" : VPseudoBinaryV_VM; - defm "" : VPseudoBinaryV_XM; - defm "" : VPseudoBinaryV_IM; +multiclass VPseudoVCALU_VM_XM { + defm "" : VPseudoBinaryV_VM, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } -multiclass VPseudoBinaryV_VM_XM { - defm "" : VPseudoBinaryV_VM; - defm "" : VPseudoBinaryV_XM; +multiclass VPseudoVCALUM_VM_XM_IM<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; + defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUI, ReadVIALUCV, ReadVMask]>; } -multiclass VPseudoBinaryM_VM_XM_IM<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; +multiclass VPseudoVCALUM_VM_XM<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV, ReadVMask]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, 
/*CarryIn=*/1, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX, ReadVMask]>; } -multiclass VPseudoBinaryM_VM_XM<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/1, Constraint>; +multiclass VPseudoVCALUM_V_X_I<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; + defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUI, ReadVIALUCV]>; } -multiclass VPseudoBinaryM_V_X_I<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_IM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; +multiclass VPseudoVCALUM_V_X<string Constraint> { + defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUV, ReadVIALUCV, ReadVIALUCV]>; + defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>, + Sched<[WriteVICALUX, ReadVIALUCV, ReadVIALUCX]>; } -multiclass VPseudoBinaryM_V_X<string Constraint> { - defm "" : VPseudoBinaryV_VM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; - defm "" : VPseudoBinaryV_XM</*CarryOut=*/1, /*CarryIn=*/0, Constraint>; +multiclass VPseudoVNCLP_WV_WX_WI { + defm "" : VPseudoBinaryV_WV, + Sched<[WriteVNClipV, ReadVNClipV, ReadVNClipV, ReadVMask]>; + defm "" : VPseudoBinaryV_WX, + Sched<[WriteVNClipX, ReadVNClipV, ReadVNClipX, ReadVMask]>; + defm "" : VPseudoBinaryV_WI, + Sched<[WriteVNClipI, ReadVNClipV, ReadVMask]>; } -multiclass VPseudoBinaryV_WV_WX_WI { - defm "" : VPseudoBinaryV_WV; - defm "" : VPseudoBinaryV_WX; - defm "" : VPseudoBinaryV_WI; +multiclass VPseudoVNSHT_WV_WX_WI { + defm "" : VPseudoBinaryV_WV, + 
Sched<[WriteVNShiftV, ReadVNShiftV, ReadVNShiftV, ReadVMask]>; + defm "" : VPseudoBinaryV_WX, + Sched<[WriteVNShiftX, ReadVNShiftV, ReadVNShiftX, ReadVMask]>; + defm "" : VPseudoBinaryV_WI, + Sched<[WriteVNShiftI, ReadVNShiftV, ReadVMask]>; } multiclass VPseudoTernary<VReg RetClass, @@ -2031,55 +2252,113 @@ multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> { defm _VI : VPseudoTernary<m.vrclass, m.vrclass, ImmType, m, Constraint>; } -multiclass VPseudoTernaryV_VV_VX_AAXA<string Constraint = ""> { - defm "" : VPseudoTernaryV_VV_AAXA<Constraint>; - defm "" : VPseudoTernaryV_VX_AAXA<Constraint>; +multiclass VPseudoVMAC_VV_VX_AAXA<string Constraint = ""> { + defm "" : VPseudoTernaryV_VV_AAXA<Constraint>, + Sched<[WriteVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVIMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryV_VX_AAXA<Constraint>, + Sched<[WriteVIMulAddX, ReadVIMulAddV, ReadVIMulAddV, ReadVIMulAddX, ReadVMask]>; } -multiclass VPseudoTernaryV_VV_VF_AAXA<string Constraint = ""> { - defm "" : VPseudoTernaryV_VV_AAXA<Constraint>; - defm "" : VPseudoTernaryV_VF_AAXA<Constraint>; +multiclass VPseudoVMAC_VV_VF_AAXA<string Constraint = ""> { + defm "" : VPseudoTernaryV_VV_AAXA<Constraint>, + Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryV_VF_AAXA<Constraint>, + Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>; } -multiclass VPseudoTernaryV_VX_VI<Operand ImmType = simm5, string Constraint = ""> { - defm "" : VPseudoTernaryV_VX<Constraint>; - defm "" : VPseudoTernaryV_VI<ImmType, Constraint>; +multiclass VPseudoVSLD_VX_VI<Operand ImmType = simm5, string Constraint = ""> { + defm "" : VPseudoTernaryV_VX<Constraint>, + Sched<[WriteVISlideX, ReadVISlideV, ReadVISlideV, ReadVISlideX, ReadVMask]>; + defm "" : VPseudoTernaryV_VI<ImmType, Constraint>, + Sched<[WriteVISlideI, ReadVISlideV, ReadVISlideV, ReadVMask]>; } -multiclass VPseudoTernaryW_VV_VX { - defm "" : 
VPseudoTernaryW_VV; - defm "" : VPseudoTernaryW_VX; +multiclass VPseudoVWMAC_VV_VX { + defm "" : VPseudoTernaryW_VV, + Sched<[WriteVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryW_VX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } -multiclass VPseudoTernaryW_VV_VF { - defm "" : VPseudoTernaryW_VV; - defm "" : VPseudoTernaryW_VF; +multiclass VPseudoVWMAC_VX { + defm "" : VPseudoTernaryW_VX, + Sched<[WriteVIWMulAddX, ReadVIWMulAddV, ReadVIWMulAddV, ReadVIWMulAddX, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VX_VI { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VX; - defm "" : VPseudoBinaryM_VI; +multiclass VPseudoVWMAC_VV_VF { + defm "" : VPseudoTernaryW_VV, + Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>; + defm "" : VPseudoTernaryW_VF, + Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VX { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VX; +multiclass VPseudoVCMPM_VV_VX_VI { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + defm "" : VPseudoBinaryM_VI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; } -multiclass VPseudoBinaryM_VV_VF { - defm "" : VPseudoBinaryM_VV; - defm "" : VPseudoBinaryM_VF; +multiclass VPseudoVCMPM_VV_VX { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVICmpV, ReadVICmpV, ReadVICmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; } -multiclass VPseudoBinaryM_VX_VI { - defm "" : VPseudoBinaryM_VX; - defm "" : VPseudoBinaryM_VI; +multiclass VPseudoVCMPM_VV_VF { + defm "" : VPseudoBinaryM_VV, + Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>; + defm "" : VPseudoBinaryM_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; } 
-multiclass VPseudoReductionV_VS { +multiclass VPseudoVCMPM_VF { + defm "" : VPseudoBinaryM_VF, + Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>; +} + +multiclass VPseudoVCMPM_VX_VI { + defm "" : VPseudoBinaryM_VX, + Sched<[WriteVICmpX, ReadVICmpV, ReadVICmpX, ReadVMask]>; + defm "" : VPseudoBinaryM_VI, + Sched<[WriteVICmpI, ReadVICmpV, ReadVMask]>; +} + +multiclass VPseudoVRED_VS { foreach m = MxList.m in { - defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>; + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV, ReadVIRedV, ReadVMask]>; + } +} + +multiclass VPseudoVWRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVMask]>; + } +} + +multiclass VPseudoVFRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV, ReadVFRedV, ReadVMask]>; + } +} + +multiclass VPseudoVFREDO_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVMask]>; + } +} + +multiclass VPseudoVFWRED_VS { + foreach m = MxList.m in { + defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>, + Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVMask]>; } } @@ -2094,9 +2373,16 @@ multiclass VPseudoConversion<VReg RetClass, } } -multiclass VPseudoConversionV_V { +multiclass VPseudoVCVTI_V { + foreach m = MxList.m in + defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, + Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVCVTF_V { foreach m = MxList.m in - defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>; + defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>, + Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>; } multiclass 
VPseudoConversionW_V { @@ -2105,10 +2391,46 @@ multiclass VPseudoConversionW_V { defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>; } -multiclass VPseudoConversionV_W { +multiclass VPseudoVWCVTI_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVWCVTF_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>; +} + +multiclass VPseudoVWCVTD_V { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>, + Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>; +} + +multiclass VPseudoVNCVTI_W { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>; +} + +multiclass VPseudoVNCVTF_W { + defvar constraint = "@earlyclobber $rd"; + foreach m = MxList.m[0-5] in + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>; +} + +multiclass VPseudoVNCVTD_W { defvar constraint = "@earlyclobber $rd"; foreach m = MxListW.m in - defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>; + defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>, + Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>; } multiclass VPseudoUSSegLoad<bit isFF> { @@ -2543,42 +2865,6 @@ class VPatTernaryMask<string intrinsic, (mask_type V0), GPR:$vl, sew)>; -class VPatAMOWDNoMask<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> : - Pat<(result_type 
(!cast<Intrinsic>(intrinsic_name) - GPR:$rs1, - (op1_type op1_reg_class:$vs2), - (result_type vlmul.vrclass:$vd), - VLOpFrag)), - (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX) - $rs1, $vs2, $vd, - GPR:$vl, sew)>; - -class VPatAMOWDMask<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> : - Pat<(result_type (!cast<Intrinsic>(intrinsic_name # "_mask") - GPR:$rs1, - (op1_type op1_reg_class:$vs2), - (result_type vlmul.vrclass:$vd), - (mask_type V0), - VLOpFrag)), - (!cast<Instruction>(inst # "_WD_" # vlmul.MX # "_" # emul.MX # "_MASK") - $rs1, $vs2, $vd, - (mask_type V0), GPR:$vl, sew)>; - multiclass VPatUnaryS_M<string intrinsic_name, string inst> { @@ -3416,44 +3702,6 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> { } } -multiclass VPatAMOWD<string intrinsic, - string inst, - ValueType result_type, - ValueType offset_type, - ValueType mask_type, - int sew, - LMULInfo vlmul, - LMULInfo emul, - VReg op1_reg_class> -{ - def : VPatAMOWDNoMask<intrinsic, inst, result_type, offset_type, - sew, vlmul, emul, op1_reg_class>; - def : VPatAMOWDMask<intrinsic, inst, result_type, offset_type, - mask_type, sew, vlmul, emul, op1_reg_class>; -} - -multiclass VPatAMOV_WD<string intrinsic, - string inst, - list<VTypeInfo> vtilist> { - foreach eew = EEWList in { - foreach vti = vtilist in { - if !or(!eq(vti.SEW, 32), !eq(vti.SEW, 64)) then { - defvar octuple_lmul = vti.LMul.octuple; - // Calculate emul = eew * lmul / sew - defvar octuple_emul = !srl(!mul(eew, octuple_lmul), vti.Log2SEW); - if !and(!ge(octuple_emul, 1), !le(octuple_emul, 64)) then { - defvar emulMX = octuple_to_str<octuple_emul>.ret; - defvar offsetVti = !cast<VTypeInfo>("VI" # eew # emulMX); - defvar inst_ei = inst # "EI" # eew; - defm : VPatAMOWD<intrinsic, inst_ei, - vti.Vector, offsetVti.Vector, - vti.Mask, vti.Log2SEW, vti.LMul, offsetVti.LMul, 
offsetVti.RegClass>; - } - } - } - } -} - //===----------------------------------------------------------------------===// // Pseudo instructions //===----------------------------------------------------------------------===// @@ -3531,11 +3779,13 @@ def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), //===----------------------------------------------------------------------===// // Pseudos Unit-Stride Loads and Stores -defm PseudoVL : VPseudoUSLoad</*isFF=*/false>; +defm PseudoVL : VPseudoUSLoad; defm PseudoVS : VPseudoUSStore; -defm PseudoVLM : VPseudoLoadMask; -defm PseudoVSM : VPseudoStoreMask; +defm PseudoVLM : VPseudoLoadMask, + Sched<[WriteVLDM, ReadVLDX]>; +defm PseudoVSM : VPseudoStoreMask, + Sched<[WriteVSTM, ReadVSTX]>; //===----------------------------------------------------------------------===// // 7.5 Vector Strided Instructions @@ -3561,7 +3811,7 @@ defm PseudoVSUX : VPseudoIStore</*Ordered=*/false>; // vleff may update VL register let hasSideEffects = 1, Defs = [VL] in -defm PseudoVL : VPseudoUSLoad</*isFF=*/true>; +defm PseudoVL : VPseudoFFLoad; //===----------------------------------------------------------------------===// // 7.8. Vector Load/Store Segment Instructions @@ -3580,28 +3830,15 @@ let hasSideEffects = 1, Defs = [VL] in defm PseudoVLSEG : VPseudoUSSegLoad</*isFF=*/true>; //===----------------------------------------------------------------------===// -// 8. Vector AMO Operations -//===----------------------------------------------------------------------===// -defm PseudoVAMOSWAP : VPseudoAMO; -defm PseudoVAMOADD : VPseudoAMO; -defm PseudoVAMOXOR : VPseudoAMO; -defm PseudoVAMOAND : VPseudoAMO; -defm PseudoVAMOOR : VPseudoAMO; -defm PseudoVAMOMIN : VPseudoAMO; -defm PseudoVAMOMAX : VPseudoAMO; -defm PseudoVAMOMINU : VPseudoAMO; -defm PseudoVAMOMAXU : VPseudoAMO; - -//===----------------------------------------------------------------------===// // 12. 
Vector Integer Arithmetic Instructions //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // 12.1. Vector Single-Width Integer Add and Subtract //===----------------------------------------------------------------------===// -defm PseudoVADD : VPseudoBinaryV_VV_VX_VI; -defm PseudoVSUB : VPseudoBinaryV_VV_VX; -defm PseudoVRSUB : VPseudoBinaryV_VX_VI; +defm PseudoVADD : VPseudoVALU_VV_VX_VI; +defm PseudoVSUB : VPseudoVALU_VV_VX; +defm PseudoVRSUB : VPseudoVALU_VX_VI; foreach vti = AllIntegerVectors in { // Match vrsub with 2 vector operands to vsub.vv by swapping operands. This @@ -3657,166 +3894,166 @@ foreach vti = AllIntegerVectors in { //===----------------------------------------------------------------------===// // 12.2. Vector Widening Integer Add/Subtract //===----------------------------------------------------------------------===// -defm PseudoVWADDU : VPseudoBinaryW_VV_VX; -defm PseudoVWSUBU : VPseudoBinaryW_VV_VX; -defm PseudoVWADD : VPseudoBinaryW_VV_VX; -defm PseudoVWSUB : VPseudoBinaryW_VV_VX; -defm PseudoVWADDU : VPseudoBinaryW_WV_WX; -defm PseudoVWSUBU : VPseudoBinaryW_WV_WX; -defm PseudoVWADD : VPseudoBinaryW_WV_WX; -defm PseudoVWSUB : VPseudoBinaryW_WV_WX; +defm PseudoVWADDU : VPseudoVWALU_VV_VX; +defm PseudoVWSUBU : VPseudoVWALU_VV_VX; +defm PseudoVWADD : VPseudoVWALU_VV_VX; +defm PseudoVWSUB : VPseudoVWALU_VV_VX; +defm PseudoVWADDU : VPseudoVWALU_WV_WX; +defm PseudoVWSUBU : VPseudoVWALU_WV_WX; +defm PseudoVWADD : VPseudoVWALU_WV_WX; +defm PseudoVWSUB : VPseudoVWALU_WV_WX; //===----------------------------------------------------------------------===// // 12.3. 
Vector Integer Extension //===----------------------------------------------------------------------===// -defm PseudoVZEXT_VF2 : PseudoUnaryV_VF2; -defm PseudoVZEXT_VF4 : PseudoUnaryV_VF4; -defm PseudoVZEXT_VF8 : PseudoUnaryV_VF8; -defm PseudoVSEXT_VF2 : PseudoUnaryV_VF2; -defm PseudoVSEXT_VF4 : PseudoUnaryV_VF4; -defm PseudoVSEXT_VF8 : PseudoUnaryV_VF8; +defm PseudoVZEXT_VF2 : PseudoVEXT_VF2; +defm PseudoVZEXT_VF4 : PseudoVEXT_VF4; +defm PseudoVZEXT_VF8 : PseudoVEXT_VF8; +defm PseudoVSEXT_VF2 : PseudoVEXT_VF2; +defm PseudoVSEXT_VF4 : PseudoVEXT_VF4; +defm PseudoVSEXT_VF8 : PseudoVEXT_VF8; //===----------------------------------------------------------------------===// // 12.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions //===----------------------------------------------------------------------===// -defm PseudoVADC : VPseudoBinaryV_VM_XM_IM; -defm PseudoVMADC : VPseudoBinaryM_VM_XM_IM<"@earlyclobber $rd">; -defm PseudoVMADC : VPseudoBinaryM_V_X_I<"@earlyclobber $rd">; +defm PseudoVADC : VPseudoVCALU_VM_XM_IM; +defm PseudoVMADC : VPseudoVCALUM_VM_XM_IM<"@earlyclobber $rd">; +defm PseudoVMADC : VPseudoVCALUM_V_X_I<"@earlyclobber $rd">; -defm PseudoVSBC : VPseudoBinaryV_VM_XM; -defm PseudoVMSBC : VPseudoBinaryM_VM_XM<"@earlyclobber $rd">; -defm PseudoVMSBC : VPseudoBinaryM_V_X<"@earlyclobber $rd">; +defm PseudoVSBC : VPseudoVCALU_VM_XM; +defm PseudoVMSBC : VPseudoVCALUM_VM_XM<"@earlyclobber $rd">; +defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 12.5. 
Vector Bitwise Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVAND : VPseudoBinaryV_VV_VX_VI; -defm PseudoVOR : VPseudoBinaryV_VV_VX_VI; -defm PseudoVXOR : VPseudoBinaryV_VV_VX_VI; +defm PseudoVAND : VPseudoVALU_VV_VX_VI; +defm PseudoVOR : VPseudoVALU_VV_VX_VI; +defm PseudoVXOR : VPseudoVALU_VV_VX_VI; //===----------------------------------------------------------------------===// // 12.6. Vector Single-Width Bit Shift Instructions //===----------------------------------------------------------------------===// -defm PseudoVSLL : VPseudoBinaryV_VV_VX_VI<uimm5>; -defm PseudoVSRL : VPseudoBinaryV_VV_VX_VI<uimm5>; -defm PseudoVSRA : VPseudoBinaryV_VV_VX_VI<uimm5>; +defm PseudoVSLL : VPseudoVSHT_VV_VX_VI<uimm5>; +defm PseudoVSRL : VPseudoVSHT_VV_VX_VI<uimm5>; +defm PseudoVSRA : VPseudoVSHT_VV_VX_VI<uimm5>; //===----------------------------------------------------------------------===// // 12.7. Vector Narrowing Integer Right Shift Instructions //===----------------------------------------------------------------------===// -defm PseudoVNSRL : VPseudoBinaryV_WV_WX_WI; -defm PseudoVNSRA : VPseudoBinaryV_WV_WX_WI; +defm PseudoVNSRL : VPseudoVNSHT_WV_WX_WI; +defm PseudoVNSRA : VPseudoVNSHT_WV_WX_WI; //===----------------------------------------------------------------------===// // 12.8. 
Vector Integer Comparison Instructions //===----------------------------------------------------------------------===// -defm PseudoVMSEQ : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSNE : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSLTU : VPseudoBinaryM_VV_VX; -defm PseudoVMSLT : VPseudoBinaryM_VV_VX; -defm PseudoVMSLEU : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSLE : VPseudoBinaryM_VV_VX_VI; -defm PseudoVMSGTU : VPseudoBinaryM_VX_VI; -defm PseudoVMSGT : VPseudoBinaryM_VX_VI; +defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSLTU : VPseudoVCMPM_VV_VX; +defm PseudoVMSLT : VPseudoVCMPM_VV_VX; +defm PseudoVMSLEU : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSLE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSGTU : VPseudoVCMPM_VX_VI; +defm PseudoVMSGT : VPseudoVCMPM_VX_VI; //===----------------------------------------------------------------------===// // 12.9. Vector Integer Min/Max Instructions //===----------------------------------------------------------------------===// -defm PseudoVMINU : VPseudoBinaryV_VV_VX; -defm PseudoVMIN : VPseudoBinaryV_VV_VX; -defm PseudoVMAXU : VPseudoBinaryV_VV_VX; -defm PseudoVMAX : VPseudoBinaryV_VV_VX; +defm PseudoVMINU : VPseudoVMINMAX_VV_VX; +defm PseudoVMIN : VPseudoVMINMAX_VV_VX; +defm PseudoVMAXU : VPseudoVMINMAX_VV_VX; +defm PseudoVMAX : VPseudoVMINMAX_VV_VX; //===----------------------------------------------------------------------===// // 12.10. Vector Single-Width Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVMUL : VPseudoBinaryV_VV_VX; -defm PseudoVMULH : VPseudoBinaryV_VV_VX; -defm PseudoVMULHU : VPseudoBinaryV_VV_VX; -defm PseudoVMULHSU : VPseudoBinaryV_VV_VX; +defm PseudoVMUL : VPseudoVMUL_VV_VX; +defm PseudoVMULH : VPseudoVMUL_VV_VX; +defm PseudoVMULHU : VPseudoVMUL_VV_VX; +defm PseudoVMULHSU : VPseudoVMUL_VV_VX; //===----------------------------------------------------------------------===// // 12.11. 
Vector Integer Divide Instructions //===----------------------------------------------------------------------===// -defm PseudoVDIVU : VPseudoBinaryV_VV_VX; -defm PseudoVDIV : VPseudoBinaryV_VV_VX; -defm PseudoVREMU : VPseudoBinaryV_VV_VX; -defm PseudoVREM : VPseudoBinaryV_VV_VX; +defm PseudoVDIVU : VPseudoVDIV_VV_VX; +defm PseudoVDIV : VPseudoVDIV_VV_VX; +defm PseudoVREMU : VPseudoVDIV_VV_VX; +defm PseudoVREM : VPseudoVDIV_VV_VX; //===----------------------------------------------------------------------===// // 12.12. Vector Widening Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMUL : VPseudoBinaryW_VV_VX; -defm PseudoVWMULU : VPseudoBinaryW_VV_VX; -defm PseudoVWMULSU : VPseudoBinaryW_VV_VX; +defm PseudoVWMUL : VPseudoVWMUL_VV_VX; +defm PseudoVWMULU : VPseudoVWMUL_VV_VX; +defm PseudoVWMULSU : VPseudoVWMUL_VV_VX; //===----------------------------------------------------------------------===// // 12.13. Vector Single-Width Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVMACC : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVNMSAC : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVMADD : VPseudoTernaryV_VV_VX_AAXA; -defm PseudoVNMSUB : VPseudoTernaryV_VV_VX_AAXA; +defm PseudoVMACC : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVNMSAC : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVMADD : VPseudoVMAC_VV_VX_AAXA; +defm PseudoVNMSUB : VPseudoVMAC_VV_VX_AAXA; //===----------------------------------------------------------------------===// // 12.14. 
Vector Widening Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMACCU : VPseudoTernaryW_VV_VX; -defm PseudoVWMACC : VPseudoTernaryW_VV_VX; -defm PseudoVWMACCSU : VPseudoTernaryW_VV_VX; -defm PseudoVWMACCUS : VPseudoTernaryW_VX; +defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACC : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCSU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCUS : VPseudoVWMAC_VX; //===----------------------------------------------------------------------===// // 12.15. Vector Integer Merge Instructions //===----------------------------------------------------------------------===// -defm PseudoVMERGE : VPseudoBinaryV_VM_XM_IM; +defm PseudoVMERGE : VPseudoVMRG_VM_XM_IM; //===----------------------------------------------------------------------===// // 12.16. Vector Integer Move Instructions //===----------------------------------------------------------------------===// -defm PseudoVMV_V : VPseudoUnaryV_V_X_I_NoDummyMask; +defm PseudoVMV_V : VPseudoUnaryVMV_V_X_I; //===----------------------------------------------------------------------===// // 13.1. Vector Single-Width Saturating Add and Subtract //===----------------------------------------------------------------------===// let Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVSADDU : VPseudoBinaryV_VV_VX_VI; - defm PseudoVSADD : VPseudoBinaryV_VV_VX_VI; - defm PseudoVSSUBU : VPseudoBinaryV_VV_VX; - defm PseudoVSSUB : VPseudoBinaryV_VV_VX; + defm PseudoVSADDU : VPseudoVSALU_VV_VX_VI; + defm PseudoVSADD : VPseudoVSALU_VV_VX_VI; + defm PseudoVSSUBU : VPseudoVSALU_VV_VX; + defm PseudoVSSUB : VPseudoVSALU_VV_VX; } //===----------------------------------------------------------------------===// // 13.2. 
Vector Single-Width Averaging Add and Subtract //===----------------------------------------------------------------------===// let Uses = [VXRM], hasSideEffects = 1 in { - defm PseudoVAADDU : VPseudoBinaryV_VV_VX; - defm PseudoVAADD : VPseudoBinaryV_VV_VX; - defm PseudoVASUBU : VPseudoBinaryV_VV_VX; - defm PseudoVASUB : VPseudoBinaryV_VV_VX; + defm PseudoVAADDU : VPseudoVAALU_VV_VX; + defm PseudoVAADD : VPseudoVAALU_VV_VX; + defm PseudoVASUBU : VPseudoVAALU_VV_VX; + defm PseudoVASUB : VPseudoVAALU_VV_VX; } //===----------------------------------------------------------------------===// // 13.3. Vector Single-Width Fractional Multiply with Rounding and Saturation //===----------------------------------------------------------------------===// let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVSMUL : VPseudoBinaryV_VV_VX; + defm PseudoVSMUL : VPseudoVSMUL_VV_VX; } //===----------------------------------------------------------------------===// // 13.4. Vector Single-Width Scaling Shift Instructions //===----------------------------------------------------------------------===// let Uses = [VXRM], hasSideEffects = 1 in { - defm PseudoVSSRL : VPseudoBinaryV_VV_VX_VI<uimm5>; - defm PseudoVSSRA : VPseudoBinaryV_VV_VX_VI<uimm5>; + defm PseudoVSSRL : VPseudoVSSHT_VV_VX_VI<uimm5>; + defm PseudoVSSRA : VPseudoVSSHT_VV_VX_VI<uimm5>; } //===----------------------------------------------------------------------===// // 13.5. 
Vector Narrowing Fixed-Point Clip Instructions //===----------------------------------------------------------------------===// let Uses = [VXRM], Defs = [VXSAT], hasSideEffects = 1 in { - defm PseudoVNCLIP : VPseudoBinaryV_WV_WX_WI; - defm PseudoVNCLIPU : VPseudoBinaryV_WV_WX_WI; + defm PseudoVNCLIP : VPseudoVNCLP_WV_WX_WI; + defm PseudoVNCLIPU : VPseudoVNCLP_WV_WX_WI; } } // Predicates = [HasVInstructions] @@ -3825,156 +4062,156 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// -defm PseudoVFADD : VPseudoBinaryV_VV_VF; -defm PseudoVFSUB : VPseudoBinaryV_VV_VF; -defm PseudoVFRSUB : VPseudoBinaryV_VF; +defm PseudoVFADD : VPseudoVALU_VV_VF; +defm PseudoVFSUB : VPseudoVALU_VV_VF; +defm PseudoVFRSUB : VPseudoVALU_VF; //===----------------------------------------------------------------------===// // 14.3. Vector Widening Floating-Point Add/Subtract Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWADD : VPseudoBinaryW_VV_VF; -defm PseudoVFWSUB : VPseudoBinaryW_VV_VF; -defm PseudoVFWADD : VPseudoBinaryW_WV_WF; -defm PseudoVFWSUB : VPseudoBinaryW_WV_WF; +defm PseudoVFWADD : VPseudoVFWALU_VV_VF; +defm PseudoVFWSUB : VPseudoVFWALU_VV_VF; +defm PseudoVFWADD : VPseudoVFWALU_WV_WF; +defm PseudoVFWSUB : VPseudoVFWALU_WV_WF; //===----------------------------------------------------------------------===// // 14.4. 
Vector Single-Width Floating-Point Multiply/Divide Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMUL : VPseudoBinaryV_VV_VF; -defm PseudoVFDIV : VPseudoBinaryV_VV_VF; -defm PseudoVFRDIV : VPseudoBinaryV_VF; +defm PseudoVFMUL : VPseudoVFMUL_VV_VF; +defm PseudoVFDIV : VPseudoVFDIV_VV_VF; +defm PseudoVFRDIV : VPseudoVFRDIV_VF; //===----------------------------------------------------------------------===// // 14.5. Vector Widening Floating-Point Multiply //===----------------------------------------------------------------------===// -defm PseudoVFWMUL : VPseudoBinaryW_VV_VF; +defm PseudoVFWMUL : VPseudoVWMUL_VV_VF; //===----------------------------------------------------------------------===// // 14.6. Vector Single-Width Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMACC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMACC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMSAC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMSAC : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMADD : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMADD : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFMSUB : VPseudoTernaryV_VV_VF_AAXA; -defm PseudoVFNMSUB : VPseudoTernaryV_VV_VF_AAXA; +defm PseudoVFMACC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMACC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMSAC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMSAC : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMADD : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMADD : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFMSUB : VPseudoVMAC_VV_VF_AAXA; +defm PseudoVFNMSUB : VPseudoVMAC_VV_VF_AAXA; //===----------------------------------------------------------------------===// // 14.7. 
Vector Widening Floating-Point Fused Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWMACC : VPseudoTernaryW_VV_VF; -defm PseudoVFWNMACC : VPseudoTernaryW_VV_VF; -defm PseudoVFWMSAC : VPseudoTernaryW_VV_VF; -defm PseudoVFWNMSAC : VPseudoTernaryW_VV_VF; +defm PseudoVFWMACC : VPseudoVWMAC_VV_VF; +defm PseudoVFWNMACC : VPseudoVWMAC_VV_VF; +defm PseudoVFWMSAC : VPseudoVWMAC_VV_VF; +defm PseudoVFWNMSAC : VPseudoVWMAC_VV_VF; //===----------------------------------------------------------------------===// // 14.8. Vector Floating-Point Square-Root Instruction //===----------------------------------------------------------------------===// -defm PseudoVFSQRT : VPseudoUnaryTAV_V; +defm PseudoVFSQRT : VPseudoVSQR_V; //===----------------------------------------------------------------------===// // 14.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction //===----------------------------------------------------------------------===// -defm PseudoVFRSQRT7 : VPseudoUnaryTAV_V; +defm PseudoVFRSQRT7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.10. Vector Floating-Point Reciprocal Estimate Instruction //===----------------------------------------------------------------------===// -defm PseudoVFREC7 : VPseudoUnaryTAV_V; +defm PseudoVFREC7 : VPseudoVRCP_V; //===----------------------------------------------------------------------===// // 14.11. Vector Floating-Point Min/Max Instructions //===----------------------------------------------------------------------===// -defm PseudoVFMIN : VPseudoBinaryV_VV_VF; -defm PseudoVFMAX : VPseudoBinaryV_VV_VF; +defm PseudoVFMIN : VPseudoVMAX_VV_VF; +defm PseudoVFMAX : VPseudoVMAX_VV_VF; //===----------------------------------------------------------------------===// // 14.12. 
Vector Floating-Point Sign-Injection Instructions //===----------------------------------------------------------------------===// -defm PseudoVFSGNJ : VPseudoBinaryV_VV_VF; -defm PseudoVFSGNJN : VPseudoBinaryV_VV_VF; -defm PseudoVFSGNJX : VPseudoBinaryV_VV_VF; +defm PseudoVFSGNJ : VPseudoVSGNJ_VV_VF; +defm PseudoVFSGNJN : VPseudoVSGNJ_VV_VF; +defm PseudoVFSGNJX : VPseudoVSGNJ_VV_VF; //===----------------------------------------------------------------------===// // 14.13. Vector Floating-Point Compare Instructions //===----------------------------------------------------------------------===// -defm PseudoVMFEQ : VPseudoBinaryM_VV_VF; -defm PseudoVMFNE : VPseudoBinaryM_VV_VF; -defm PseudoVMFLT : VPseudoBinaryM_VV_VF; -defm PseudoVMFLE : VPseudoBinaryM_VV_VF; -defm PseudoVMFGT : VPseudoBinaryM_VF; -defm PseudoVMFGE : VPseudoBinaryM_VF; +defm PseudoVMFEQ : VPseudoVCMPM_VV_VF; +defm PseudoVMFNE : VPseudoVCMPM_VV_VF; +defm PseudoVMFLT : VPseudoVCMPM_VV_VF; +defm PseudoVMFLE : VPseudoVCMPM_VV_VF; +defm PseudoVMFGT : VPseudoVCMPM_VF; +defm PseudoVMFGE : VPseudoVCMPM_VF; //===----------------------------------------------------------------------===// // 14.14. Vector Floating-Point Classify Instruction //===----------------------------------------------------------------------===// -defm PseudoVFCLASS : VPseudoUnaryV_V; +defm PseudoVFCLASS : VPseudoVCLS_V; //===----------------------------------------------------------------------===// // 14.15. Vector Floating-Point Merge Instruction //===----------------------------------------------------------------------===// -defm PseudoVFMERGE : VPseudoBinaryV_FM; +defm PseudoVFMERGE : VPseudoVMRG_FM; //===----------------------------------------------------------------------===// // 14.16. 
Vector Floating-Point Move Instruction //===----------------------------------------------------------------------===// -defm PseudoVFMV_V : VPseudoUnaryV_F_NoDummyMask; +defm PseudoVFMV_V : VPseudoVMV_F; //===----------------------------------------------------------------------===// // 14.17. Single-Width Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFCVT_XU_F : VPseudoConversionV_V; -defm PseudoVFCVT_X_F : VPseudoConversionV_V; -defm PseudoVFCVT_RTZ_XU_F : VPseudoConversionV_V; -defm PseudoVFCVT_RTZ_X_F : VPseudoConversionV_V; -defm PseudoVFCVT_F_XU : VPseudoConversionV_V; -defm PseudoVFCVT_F_X : VPseudoConversionV_V; +defm PseudoVFCVT_XU_F : VPseudoVCVTI_V; +defm PseudoVFCVT_X_F : VPseudoVCVTI_V; +defm PseudoVFCVT_RTZ_XU_F : VPseudoVCVTI_V; +defm PseudoVFCVT_RTZ_X_F : VPseudoVCVTI_V; +defm PseudoVFCVT_F_XU : VPseudoVCVTF_V; +defm PseudoVFCVT_F_X : VPseudoVCVTF_V; //===----------------------------------------------------------------------===// // 14.18. Widening Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFWCVT_XU_F : VPseudoConversionW_V; -defm PseudoVFWCVT_X_F : VPseudoConversionW_V; -defm PseudoVFWCVT_RTZ_XU_F : VPseudoConversionW_V; -defm PseudoVFWCVT_RTZ_X_F : VPseudoConversionW_V; -defm PseudoVFWCVT_F_XU : VPseudoConversionW_V; -defm PseudoVFWCVT_F_X : VPseudoConversionW_V; -defm PseudoVFWCVT_F_F : VPseudoConversionW_V; +defm PseudoVFWCVT_XU_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_X_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_RTZ_XU_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_RTZ_X_F : VPseudoVWCVTI_V; +defm PseudoVFWCVT_F_XU : VPseudoVWCVTF_V; +defm PseudoVFWCVT_F_X : VPseudoVWCVTF_V; +defm PseudoVFWCVT_F_F : VPseudoVWCVTD_V; //===----------------------------------------------------------------------===// // 14.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions //===----------------------------------------------------------------------===// -defm PseudoVFNCVT_XU_F : VPseudoConversionV_W; -defm PseudoVFNCVT_X_F : VPseudoConversionV_W; -defm PseudoVFNCVT_RTZ_XU_F : VPseudoConversionV_W; -defm PseudoVFNCVT_RTZ_X_F : VPseudoConversionV_W; -defm PseudoVFNCVT_F_XU : VPseudoConversionV_W; -defm PseudoVFNCVT_F_X : VPseudoConversionV_W; -defm PseudoVFNCVT_F_F : VPseudoConversionV_W; -defm PseudoVFNCVT_ROD_F_F : VPseudoConversionV_W; +defm PseudoVFNCVT_XU_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_X_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_XU_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_RTZ_X_F : VPseudoVNCVTI_W; +defm PseudoVFNCVT_F_XU : VPseudoVNCVTF_W; +defm PseudoVFNCVT_F_X : VPseudoVNCVTF_W; +defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W; +defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W; } // Predicates = [HasVInstructionsAnyF] let Predicates = [HasVInstructions] in { //===----------------------------------------------------------------------===// // 15.1. Vector Single-Width Integer Reduction Instructions //===----------------------------------------------------------------------===// -defm PseudoVREDSUM : VPseudoReductionV_VS; -defm PseudoVREDAND : VPseudoReductionV_VS; -defm PseudoVREDOR : VPseudoReductionV_VS; -defm PseudoVREDXOR : VPseudoReductionV_VS; -defm PseudoVREDMINU : VPseudoReductionV_VS; -defm PseudoVREDMIN : VPseudoReductionV_VS; -defm PseudoVREDMAXU : VPseudoReductionV_VS; -defm PseudoVREDMAX : VPseudoReductionV_VS; +defm PseudoVREDSUM : VPseudoVRED_VS; +defm PseudoVREDAND : VPseudoVRED_VS; +defm PseudoVREDOR : VPseudoVRED_VS; +defm PseudoVREDXOR : VPseudoVRED_VS; +defm PseudoVREDMINU : VPseudoVRED_VS; +defm PseudoVREDMIN : VPseudoVRED_VS; +defm PseudoVREDMAXU : VPseudoVRED_VS; +defm PseudoVREDMAX : VPseudoVRED_VS; //===----------------------------------------------------------------------===// // 15.2. 
Vector Widening Integer Reduction Instructions //===----------------------------------------------------------------------===// let IsRVVWideningReduction = 1 in { -defm PseudoVWREDSUMU : VPseudoReductionV_VS; -defm PseudoVWREDSUM : VPseudoReductionV_VS; +defm PseudoVWREDSUMU : VPseudoVWRED_VS; +defm PseudoVWREDSUM : VPseudoVWRED_VS; } } // Predicates = [HasVInstructions] @@ -3982,17 +4219,17 @@ let Predicates = [HasVInstructionsAnyF] in { //===----------------------------------------------------------------------===// // 15.3. Vector Single-Width Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// -defm PseudoVFREDOSUM : VPseudoReductionV_VS; -defm PseudoVFREDUSUM : VPseudoReductionV_VS; -defm PseudoVFREDMIN : VPseudoReductionV_VS; -defm PseudoVFREDMAX : VPseudoReductionV_VS; +defm PseudoVFREDOSUM : VPseudoVFREDO_VS; +defm PseudoVFREDUSUM : VPseudoVFRED_VS; +defm PseudoVFREDMIN : VPseudoVFRED_VS; +defm PseudoVFREDMAX : VPseudoVFRED_VS; //===----------------------------------------------------------------------===// // 15.4. 
Vector Widening Floating-Point Reduction Instructions //===----------------------------------------------------------------------===// let IsRVVWideningReduction = 1 in { -defm PseudoVFWREDUSUM : VPseudoReductionV_VS; -defm PseudoVFWREDOSUM : VPseudoReductionV_VS; +defm PseudoVFWREDUSUM : VPseudoVFWRED_VS; +defm PseudoVFWREDOSUM : VPseudoVFWRED_VS; } } // Predicates = [HasVInstructionsAnyF] @@ -4005,55 +4242,57 @@ defm PseudoVFWREDOSUM : VPseudoReductionV_VS; // 16.1 Vector Mask-Register Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVMAND: VPseudoBinaryM_MM; -defm PseudoVMNAND: VPseudoBinaryM_MM; -defm PseudoVMANDN: VPseudoBinaryM_MM; -defm PseudoVMXOR: VPseudoBinaryM_MM; -defm PseudoVMOR: VPseudoBinaryM_MM; -defm PseudoVMNOR: VPseudoBinaryM_MM; -defm PseudoVMORN: VPseudoBinaryM_MM; -defm PseudoVMXNOR: VPseudoBinaryM_MM; +defm PseudoVMAND: VPseudoVALU_MM; +defm PseudoVMNAND: VPseudoVALU_MM; +defm PseudoVMANDN: VPseudoVALU_MM; +defm PseudoVMXOR: VPseudoVALU_MM; +defm PseudoVMOR: VPseudoVALU_MM; +defm PseudoVMNOR: VPseudoVALU_MM; +defm PseudoVMORN: VPseudoVALU_MM; +defm PseudoVMXNOR: VPseudoVALU_MM; // Pseudo instructions -defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">; -defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">; +defm PseudoVMCLR : VPseudoNullaryPseudoM<"VMXOR">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; +defm PseudoVMSET : VPseudoNullaryPseudoM<"VMXNOR">, + Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>; //===----------------------------------------------------------------------===// // 16.2. Vector mask population count vcpop //===----------------------------------------------------------------------===// -defm PseudoVCPOP: VPseudoUnaryS_M; +defm PseudoVCPOP: VPseudoVPOP_M; //===----------------------------------------------------------------------===// // 16.3. 
vfirst find-first-set mask bit //===----------------------------------------------------------------------===// -defm PseudoVFIRST: VPseudoUnaryS_M; +defm PseudoVFIRST: VPseudoV1ST_M; //===----------------------------------------------------------------------===// // 16.4. vmsbf.m set-before-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSBF: VPseudoUnaryM_M; +defm PseudoVMSBF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.5. vmsif.m set-including-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSIF: VPseudoUnaryM_M; +defm PseudoVMSIF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.6. vmsof.m set-only-first mask bit //===----------------------------------------------------------------------===// -defm PseudoVMSOF: VPseudoUnaryM_M; +defm PseudoVMSOF: VPseudoVSFS_M; //===----------------------------------------------------------------------===// // 16.8. Vector Iota Instruction //===----------------------------------------------------------------------===// -defm PseudoVIOTA_M: VPseudoUnaryV_M; +defm PseudoVIOTA_M: VPseudoVIOT_M; //===----------------------------------------------------------------------===// // 16.9. Vector Element Index Instruction //===----------------------------------------------------------------------===// -defm PseudoVID : VPseudoMaskNullaryV; +defm PseudoVID : VPseudoVID_V; //===----------------------------------------------------------------------===// // 17. 
Vector Permutation Instructions @@ -4068,15 +4307,18 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { foreach m = MxList.m in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VMV_X_S in - def PseudoVMV_X_S # "_" # m.MX: Pseudo<(outs GPR:$rd), - (ins m.vrclass:$rs2, ixlenimm:$sew), - []>, RISCVVPseudo; + def PseudoVMV_X_S # "_" # m.MX: + Pseudo<(outs GPR:$rd), (ins m.vrclass:$rs2, ixlenimm:$sew), []>, + Sched<[WriteVIMovVX, ReadVIMovVX]>, + RISCVVPseudo; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, Constraints = "$rd = $rs1" in def PseudoVMV_S_X # "_" # m.MX: Pseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew), - []>, RISCVVPseudo; + []>, + Sched<[WriteVIMovXV, ReadVIMovXV, ReadVIMovXX]>, + RISCVVPseudo; } } } @@ -4093,17 +4335,19 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { let VLMul = m.value in { let HasSEWOp = 1, BaseInstr = VFMV_F_S in def "PseudoVFMV_" # f.FX # "_S_" # m.MX : - Pseudo<(outs f.fprclass:$rd), - (ins m.vrclass:$rs2, - ixlenimm:$sew), - []>, RISCVVPseudo; + Pseudo<(outs f.fprclass:$rd), + (ins m.vrclass:$rs2, ixlenimm:$sew), []>, + Sched<[WriteVFMovVF, ReadVFMovVF]>, + RISCVVPseudo; let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, Constraints = "$rd = $rs1" in def "PseudoVFMV_S_" # f.FX # "_" # m.MX : Pseudo<(outs m.vrclass:$rd), (ins m.vrclass:$rs1, f.fprclass:$rs2, AVL:$vl, ixlenimm:$sew), - []>, RISCVVPseudo; + []>, + Sched<[WriteVFMovFV, ReadVFMovFV, ReadVFMovFX]>, + RISCVVPseudo; } } } @@ -4114,52 +4358,33 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { // 17.3. 
Vector Slide Instructions //===----------------------------------------------------------------------===// let Predicates = [HasVInstructions] in { - defm PseudoVSLIDEUP : VPseudoTernaryV_VX_VI<uimm5, "@earlyclobber $rd">; - defm PseudoVSLIDEDOWN : VPseudoTernaryV_VX_VI<uimm5>; - defm PseudoVSLIDE1UP : VPseudoBinaryV_VX<"@earlyclobber $rd">; - defm PseudoVSLIDE1DOWN : VPseudoBinaryV_VX; + defm PseudoVSLIDEUP : VPseudoVSLD_VX_VI<uimm5, "@earlyclobber $rd">; + defm PseudoVSLIDEDOWN : VPseudoVSLD_VX_VI<uimm5>; + defm PseudoVSLIDE1UP : VPseudoVSLD1_VX<"@earlyclobber $rd">; + defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX; } // Predicates = [HasVInstructions] let Predicates = [HasVInstructionsAnyF] in { - defm PseudoVFSLIDE1UP : VPseudoBinaryV_VF<"@earlyclobber $rd">; - defm PseudoVFSLIDE1DOWN : VPseudoBinaryV_VF; + defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">; + defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF; } // Predicates = [HasVInstructionsAnyF] //===----------------------------------------------------------------------===// // 17.4. Vector Register Gather Instructions //===----------------------------------------------------------------------===// -defm PseudoVRGATHER : VPseudoBinaryV_VV_VX_VI<uimm5, "@earlyclobber $rd">; -defm PseudoVRGATHEREI16 : VPseudoBinaryV_VV_EEW</* eew */ 16, "@earlyclobber $rd">; +defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI<uimm5, "@earlyclobber $rd">; +defm PseudoVRGATHEREI16 : VPseudoVGTR_VV_EEW</* eew */ 16, "@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 17.5. Vector Compress Instruction //===----------------------------------------------------------------------===// -defm PseudoVCOMPRESS : VPseudoUnaryV_V_AnyMask; +defm PseudoVCOMPRESS : VPseudoVCPR_V; //===----------------------------------------------------------------------===// // Patterns. 
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// 8. Vector AMO Operations -//===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZvamo] in { - defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoadd", "PseudoVAMOADD", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoxor", "PseudoVAMOXOR", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoand", "PseudoVAMOAND", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamoor", "PseudoVAMOOR", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomin", "PseudoVAMOMIN", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomax", "PseudoVAMOMAX", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamominu", "PseudoVAMOMINU", AllIntegerVectors>; - defm : VPatAMOV_WD<"int_riscv_vamomaxu", "PseudoVAMOMAXU", AllIntegerVectors>; -} // Predicates = [HasStdExtZvamo] - -let Predicates = [HasStdExtZvamo, HasVInstructionsAnyF] in { - defm : VPatAMOV_WD<"int_riscv_vamoswap", "PseudoVAMOSWAP", AllFloatVectors>; -} // Predicates = [HasStdExtZvamo, HasVInstructionsAnyF] - -//===----------------------------------------------------------------------===// // 12. 
Vector Integer Arithmetic Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td index 461bdd348934..7eb8ae7d4193 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td @@ -382,50 +382,50 @@ def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri", } // Predicates = [HasStdExtZbt] let Predicates = [HasStdExtZbb] in { -def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">, +def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM, "clz">, Sched<[WriteCLZ, ReadCLZ]>; -def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">, +def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM, "ctz">, Sched<[WriteCTZ, ReadCTZ]>; -def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "cpop">, +def CPOP : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM, "cpop">, Sched<[WriteCPOP, ReadCPOP]>; } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbm, IsRV64] in -def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, RISCVOpcode<0b0010011>, - "bmatflip">, Sched<[]>; +def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, OPC_OP_IMM, "bmatflip">, + Sched<[]>; let Predicates = [HasStdExtZbb] in { -def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>, - "sext.b">, Sched<[WriteIALU, ReadIALU]>; -def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>, - "sext.h">, Sched<[WriteIALU, ReadIALU]>; +def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, OPC_OP_IMM, "sext.b">, + Sched<[WriteIALU, ReadIALU]>; +def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, OPC_OP_IMM, "sext.h">, + Sched<[WriteIALU, ReadIALU]>; } // Predicates = [HasStdExtZbb] let Predicates = [HasStdExtZbr] in { -def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, RISCVOpcode<0b0010011>, - "crc32.b">, Sched<[]>; -def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, 
RISCVOpcode<0b0010011>, - "crc32.h">, Sched<[]>; -def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, RISCVOpcode<0b0010011>, - "crc32.w">, Sched<[]>; +def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, OPC_OP_IMM, "crc32.b">, + Sched<[]>; +def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, OPC_OP_IMM, "crc32.h">, + Sched<[]>; +def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, OPC_OP_IMM, "crc32.w">, + Sched<[]>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, RISCVOpcode<0b0010011>, - "crc32.d">, Sched<[]>; +def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, OPC_OP_IMM, "crc32.d">, + Sched<[]>; let Predicates = [HasStdExtZbr] in { -def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, RISCVOpcode<0b0010011>, - "crc32c.b">, Sched<[]>; -def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, RISCVOpcode<0b0010011>, - "crc32c.h">, Sched<[]>; -def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, RISCVOpcode<0b0010011>, - "crc32c.w">, Sched<[]>; +def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, OPC_OP_IMM, "crc32c.b">, + Sched<[]>; +def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, OPC_OP_IMM, "crc32c.h">, + Sched<[]>; +def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, OPC_OP_IMM, "crc32c.w">, + Sched<[]>; } // Predicates = [HasStdExtZbr] let Predicates = [HasStdExtZbr, IsRV64] in -def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, RISCVOpcode<0b0010011>, - "crc32c.d">, Sched<[]>; +def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">, + Sched<[]>; let Predicates = [HasStdExtZbc] in { def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>; @@ -523,12 +523,12 @@ def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32, } // Predicates = [HasStdExtZbt, IsRV64] let Predicates = [HasStdExtZbb, IsRV64] in { -def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>, - "clzw">, Sched<[WriteCLZ32, ReadCLZ32]>; -def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>, - "ctzw">, 
Sched<[WriteCTZ32, ReadCTZ32]>; -def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>, - "cpopw">, Sched<[WriteCPOP32, ReadCPOP32]>; +def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, OPC_OP_IMM_32, "clzw">, + Sched<[WriteCLZ32, ReadCLZ32]>; +def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, OPC_OP_IMM_32, "ctzw">, + Sched<[WriteCTZ32, ReadCTZ32]>; +def CPOPW : RVBUnary<0b0110000, 0b00010, 0b001, OPC_OP_IMM_32, "cpopw">, + Sched<[WriteCPOP32, ReadCPOP32]>; } // Predicates = [HasStdExtZbb, IsRV64] let Predicates = [HasStdExtZbp, IsRV64] in { @@ -791,6 +791,9 @@ def : Pat<(xor GPR:$rs1, BSETINVMask:$mask), def : Pat<(and (srl GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1)), (BEXTI GPR:$rs1, uimmlog2xlen:$shamt)>; +def : Pat<(and (not (srl GPR:$rs1, uimmlog2xlen:$shamt)), (XLenVT 1)), + (XORI (BEXTI GPR:$rs1, uimmlog2xlen:$shamt), (XLenVT 1))>; + def : Pat<(or GPR:$r, BSETINVTwoBitsMask:$i), (BSETI (BSETI GPR:$r, (TrailingZerosXForm BSETINVTwoBitsMask:$i)), (BSETINVTwoBitsMaskHigh BSETINVTwoBitsMask:$i))>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index a33494461869..663e44813899 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -28,41 +28,6 @@ def riscv_fmv_x_anyexth : SDNode<"RISCVISD::FMV_X_ANYEXTH", SDT_RISCVFMV_X_ANYEXTH>; //===----------------------------------------------------------------------===// -// Instruction class templates -//===----------------------------------------------------------------------===// - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPFMAH_rrr_frm<RISCVOpcode opcode, string opcodestr> - : RVInstR4Frm<0b10, opcode, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, frmarg:$funct3), - opcodestr, "$rd, $rs1, $rs2, $rs3, $funct3">; - -class FPFMAHDynFrmAlias<FPFMAH_rrr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2, $rs3", - (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, 
FPR16:$rs3, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUH_rr<bits<7> funct7, bits<3> funct3, string opcodestr> - : RVInstR<funct7, funct3, OPC_OP_FP, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPALUH_rr_frm<bits<7> funct7, string opcodestr> - : RVInstRFrm<funct7, OPC_OP_FP, (outs FPR16:$rd), - (ins FPR16:$rs1, FPR16:$rs2, frmarg:$funct3), opcodestr, - "$rd, $rs1, $rs2, $funct3">; - -class FPALUHDynFrmAlias<FPALUH_rr_frm Inst, string OpcodeStr> - : InstAlias<OpcodeStr#" $rd, $rs1, $rs2", - (Inst FPR16:$rd, FPR16:$rs1, FPR16:$rs2, 0b111)>; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class FPCmpH_rr<bits<3> funct3, string opcodestr> - : RVInstR<0b1010010, funct3, OPC_OP_FP, (outs GPR:$rd), - (ins FPR16:$rs1, FPR16:$rs2), opcodestr, "$rd, $rs1, $rs2">, - Sched<[WriteFCmp16, ReadFCmp16, ReadFCmp16]>; - -//===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -84,145 +49,120 @@ def FSH : RVInstS<0b001, OPC_STORE_FP, (outs), } // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { -def FMADD_H : FPFMAH_rrr_frm<OPC_MADD, "fmadd.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FMADD_H, "fmadd.h">; -def FMSUB_H : FPFMAH_rrr_frm<OPC_MSUB, "fmsub.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FMSUB_H, "fmsub.h">; -def FNMSUB_H : FPFMAH_rrr_frm<OPC_NMSUB, "fnmsub.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FNMSUB_H, "fnmsub.h">; -def FNMADD_H : FPFMAH_rrr_frm<OPC_NMADD, "fnmadd.h">, - Sched<[WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16]>; -def : FPFMAHDynFrmAlias<FNMADD_H, "fnmadd.h">; +let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { +def FMADD_H : 
FPFMA_rrr_frm<OPC_MADD, 0b10, "fmadd.h", FPR16>; +def FMSUB_H : FPFMA_rrr_frm<OPC_MSUB, 0b10, "fmsub.h", FPR16>; +def FNMSUB_H : FPFMA_rrr_frm<OPC_NMSUB, 0b10, "fnmsub.h", FPR16>; +def FNMADD_H : FPFMA_rrr_frm<OPC_NMADD, 0b10, "fnmadd.h", FPR16>; +} -def FADD_H : FPALUH_rr_frm<0b0000010, "fadd.h">, +def : FPFMADynFrmAlias<FMADD_H, "fmadd.h", FPR16>; +def : FPFMADynFrmAlias<FMSUB_H, "fmsub.h", FPR16>; +def : FPFMADynFrmAlias<FNMSUB_H, "fnmsub.h", FPR16>; +def : FPFMADynFrmAlias<FNMADD_H, "fnmadd.h", FPR16>; + +def FADD_H : FPALU_rr_frm<0b0000010, "fadd.h", FPR16>, Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def : FPALUHDynFrmAlias<FADD_H, "fadd.h">; -def FSUB_H : FPALUH_rr_frm<0b0000110, "fsub.h">, +def FSUB_H : FPALU_rr_frm<0b0000110, "fsub.h", FPR16>, Sched<[WriteFALU16, ReadFALU16, ReadFALU16]>; -def : FPALUHDynFrmAlias<FSUB_H, "fsub.h">; -def FMUL_H : FPALUH_rr_frm<0b0001010, "fmul.h">, +def FMUL_H : FPALU_rr_frm<0b0001010, "fmul.h", FPR16>, Sched<[WriteFMul16, ReadFMul16, ReadFMul16]>; -def : FPALUHDynFrmAlias<FMUL_H, "fmul.h">; -def FDIV_H : FPALUH_rr_frm<0b0001110, "fdiv.h">, +def FDIV_H : FPALU_rr_frm<0b0001110, "fdiv.h", FPR16>, Sched<[WriteFDiv16, ReadFDiv16, ReadFDiv16]>; -def : FPALUHDynFrmAlias<FDIV_H, "fdiv.h">; -def FSQRT_H : FPUnaryOp_r_frm<0b0101110, FPR16, FPR16, "fsqrt.h">, - Sched<[WriteFSqrt16, ReadFSqrt16]> { - let rs2 = 0b00000; -} +def : FPALUDynFrmAlias<FADD_H, "fadd.h", FPR16>; +def : FPALUDynFrmAlias<FSUB_H, "fsub.h", FPR16>; +def : FPALUDynFrmAlias<FMUL_H, "fmul.h", FPR16>; +def : FPALUDynFrmAlias<FDIV_H, "fdiv.h", FPR16>; + +def FSQRT_H : FPUnaryOp_r_frm<0b0101110, 0b00000, FPR16, FPR16, "fsqrt.h">, + Sched<[WriteFSqrt16, ReadFSqrt16]>; def : FPUnaryOpDynFrmAlias<FSQRT_H, "fsqrt.h", FPR16, FPR16>; -def FSGNJ_H : FPALUH_rr<0b0010010, 0b000, "fsgnj.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FSGNJN_H : FPALUH_rr<0b0010010, 0b001, "fsgnjn.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FSGNJX_H : 
FPALUH_rr<0b0010010, 0b010, "fsgnjx.h">, - Sched<[WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16]>; -def FMIN_H : FPALUH_rr<0b0010110, 0b000, "fmin.h">, - Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>; -def FMAX_H : FPALUH_rr<0b0010110, 0b001, "fmax.h">, - Sched<[WriteFMinMax16, ReadFMinMax16, ReadFMinMax16]>; +let SchedRW = [WriteFSGNJ16, ReadFSGNJ16, ReadFSGNJ16], + mayRaiseFPException = 0 in { +def FSGNJ_H : FPALU_rr<0b0010010, 0b000, "fsgnj.h", FPR16>; +def FSGNJN_H : FPALU_rr<0b0010010, 0b001, "fsgnjn.h", FPR16>; +def FSGNJX_H : FPALU_rr<0b0010010, 0b010, "fsgnjx.h", FPR16>; +} -def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.w.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> { - let rs2 = 0b00000; +let SchedRW = [WriteFMinMax16, ReadFMinMax16, ReadFMinMax16] in { +def FMIN_H : FPALU_rr<0b0010110, 0b000, "fmin.h", FPR16>; +def FMAX_H : FPALU_rr<0b0010110, 0b001, "fmax.h", FPR16>; } + +def FCVT_W_H : FPUnaryOp_r_frm<0b1100010, 0b00000, GPR, FPR16, "fcvt.w.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_W_H, "fcvt.w.h", GPR, FPR16>; -def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.wu.h">, - Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]> { - let rs2 = 0b00001; -} +def FCVT_WU_H : FPUnaryOp_r_frm<0b1100010, 0b00001, GPR, FPR16, "fcvt.wu.h">, + Sched<[WriteFCvtF16ToI32, ReadFCvtF16ToI32]>; def : FPUnaryOpDynFrmAlias<FCVT_WU_H, "fcvt.wu.h", GPR, FPR16>; -def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.w">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> { - let rs2 = 0b00000; -} +def FCVT_H_W : FPUnaryOp_r_frm<0b1101010, 0b00000, FPR16, GPR, "fcvt.h.w">, + Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_W, "fcvt.h.w", FPR16, GPR>; -def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.wu">, - Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]> { - let rs2 = 0b00001; -} +def FCVT_H_WU : FPUnaryOp_r_frm<0b1101010, 0b00001, FPR16, GPR, "fcvt.h.wu">, + 
Sched<[WriteFCvtI32ToF16, ReadFCvtI32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_WU, "fcvt.h.wu", FPR16, GPR>; } // Predicates = [HasStdExtZfh] let Predicates = [HasStdExtZfhmin] in { -def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, FPR16, FPR32, "fcvt.h.s">, - Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]> { - let rs2 = 0b00000; -} +def FCVT_H_S : FPUnaryOp_r_frm<0b0100010, 0b00000, FPR16, FPR32, "fcvt.h.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_S, "fcvt.h.s", FPR16, FPR32>; -def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b000, FPR32, FPR16, "fcvt.s.h">, - Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]> { - let rs2 = 0b00010; -} +def FCVT_S_H : FPUnaryOp_r<0b0100000, 0b00010, 0b000, FPR32, FPR16, "fcvt.s.h">, + Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; -def FMV_X_H : FPUnaryOp_r<0b1110010, 0b000, GPR, FPR16, "fmv.x.h">, - Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_X_H : FPUnaryOp_r<0b1110010, 0b00000, 0b000, GPR, FPR16, "fmv.x.h">, + Sched<[WriteFMovF16ToI16, ReadFMovF16ToI16]>; -def FMV_H_X : FPUnaryOp_r<0b1111010, 0b000, FPR16, GPR, "fmv.h.x">, - Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]> { - let rs2 = 0b00000; -} +let mayRaiseFPException = 0 in +def FMV_H_X : FPUnaryOp_r<0b1111010, 0b00000, 0b000, FPR16, GPR, "fmv.h.x">, + Sched<[WriteFMovI16ToF16, ReadFMovI16ToF16]>; } // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { -def FEQ_H : FPCmpH_rr<0b010, "feq.h">; -def FLT_H : FPCmpH_rr<0b001, "flt.h">; -def FLE_H : FPCmpH_rr<0b000, "fle.h">; -def FCLASS_H : FPUnaryOp_r<0b1110010, 0b001, GPR, FPR16, "fclass.h">, - Sched<[WriteFClass16, ReadFClass16]> { - let rs2 = 0b00000; +let SchedRW = [WriteFCmp16, ReadFCmp16, ReadFCmp16] in { +def FEQ_H : FPCmp_rr<0b1010010, 0b010, "feq.h", FPR16>; +def FLT_H : FPCmp_rr<0b1010010, 0b001, "flt.h", FPR16>; +def FLE_H : FPCmp_rr<0b1010010, 0b000, "fle.h", FPR16>; } + +let mayRaiseFPException = 0 in +def FCLASS_H : 
FPUnaryOp_r<0b1110010, 0b00000, 0b001, GPR, FPR16, "fclass.h">, + Sched<[WriteFClass16, ReadFClass16]>; } // Predicates = [HasStdExtZfh] let Predicates = [HasStdExtZfh, IsRV64] in { -def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.l.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> { - let rs2 = 0b00010; -} +def FCVT_L_H : FPUnaryOp_r_frm<0b1100010, 0b00010, GPR, FPR16, "fcvt.l.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_L_H, "fcvt.l.h", GPR, FPR16>; -def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, GPR, FPR16, "fcvt.lu.h">, - Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]> { - let rs2 = 0b00011; -} +def FCVT_LU_H : FPUnaryOp_r_frm<0b1100010, 0b00011, GPR, FPR16, "fcvt.lu.h">, + Sched<[WriteFCvtF16ToI64, ReadFCvtF16ToI64]>; def : FPUnaryOpDynFrmAlias<FCVT_LU_H, "fcvt.lu.h", GPR, FPR16>; -def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.l">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> { - let rs2 = 0b00010; -} +def FCVT_H_L : FPUnaryOp_r_frm<0b1101010, 0b00010, FPR16, GPR, "fcvt.h.l">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_L, "fcvt.h.l", FPR16, GPR>; -def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, FPR16, GPR, "fcvt.h.lu">, - Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]> { - let rs2 = 0b00011; -} +def FCVT_H_LU : FPUnaryOp_r_frm<0b1101010, 0b00011, FPR16, GPR, "fcvt.h.lu">, + Sched<[WriteFCvtI64ToF16, ReadFCvtI64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_LU, "fcvt.h.lu", FPR16, GPR>; } // Predicates = [HasStdExtZfh, IsRV64] let Predicates = [HasStdExtZfhmin, HasStdExtD] in { -def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, FPR16, FPR64, "fcvt.h.d">, - Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]> { - let rs2 = 0b00001; -} +def FCVT_H_D : FPUnaryOp_r_frm<0b0100010, 0b00001, FPR16, FPR64, "fcvt.h.d">, + Sched<[WriteFCvtF64ToF16, ReadFCvtF64ToF16]>; def : FPUnaryOpDynFrmAlias<FCVT_H_D, "fcvt.h.d", FPR16, FPR64>; -def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR16, 
"fcvt.d.h">, - Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]> { - let rs2 = 0b00010; -} +def FCVT_D_H : FPUnaryOp_r<0b0100001, 0b00010, 0b000, FPR64, FPR16, "fcvt.d.h">, + Sched<[WriteFCvtF16ToF64, ReadFCvtF16ToF64]>; } // Predicates = [HasStdExtZfhmin, HasStdExtD] //===----------------------------------------------------------------------===// @@ -275,12 +215,12 @@ def : Pat<(f16 (fpimm0)), (FMV_H_X X0)>; /// Float arithmetic operations -def : PatFpr16Fpr16DynFrm<fadd, FADD_H>; -def : PatFpr16Fpr16DynFrm<fsub, FSUB_H>; -def : PatFpr16Fpr16DynFrm<fmul, FMUL_H>; -def : PatFpr16Fpr16DynFrm<fdiv, FDIV_H>; +def : PatFpr16Fpr16DynFrm<any_fadd, FADD_H>; +def : PatFpr16Fpr16DynFrm<any_fsub, FSUB_H>; +def : PatFpr16Fpr16DynFrm<any_fmul, FMUL_H>; +def : PatFpr16Fpr16DynFrm<any_fdiv, FDIV_H>; -def : Pat<(fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>; +def : Pat<(any_fsqrt FPR16:$rs1), (FSQRT_H FPR16:$rs1, 0b111)>; def : Pat<(fneg FPR16:$rs1), (FSGNJN_H $rs1, $rs1)>; def : Pat<(fabs FPR16:$rs1), (FSGNJX_H $rs1, $rs1)>; @@ -292,19 +232,19 @@ def : Pat<(fcopysign FPR16:$rs1, FPR32:$rs2), def : Pat<(fcopysign FPR32:$rs1, FPR16:$rs2), (FSGNJ_S $rs1, (FCVT_S_H $rs2))>; // fmadd: rs1 * rs2 + rs3 -def : Pat<(fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3), +def : Pat<(any_fma FPR16:$rs1, FPR16:$rs2, FPR16:$rs3), (FMADD_H $rs1, $rs2, $rs3, 0b111)>; // fmsub: rs1 * rs2 - rs3 -def : Pat<(fma FPR16:$rs1, FPR16:$rs2, (fneg FPR16:$rs3)), +def : Pat<(any_fma FPR16:$rs1, FPR16:$rs2, (fneg FPR16:$rs3)), (FMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // fnmsub: -rs1 * rs2 + rs3 -def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3), +def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, FPR16:$rs3), (FNMSUB_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // fnmadd: -rs1 * rs2 - rs3 -def : Pat<(fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)), +def : Pat<(any_fma (fneg FPR16:$rs1), FPR16:$rs2, (fneg FPR16:$rs3)), (FNMADD_H FPR16:$rs1, FPR16:$rs2, FPR16:$rs3, 0b111)>; // The ratified 
20191213 ISA spec defines fmin and fmax in a way that matches @@ -337,8 +277,8 @@ defm : StPat<store, FSH, FPR16, f16>; /// Float conversion operations // f32 -> f16, f16 -> f32 -def : Pat<(fpround FPR32:$rs1), (FCVT_H_S FPR32:$rs1, 0b111)>; -def : Pat<(fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; +def : Pat<(any_fpround FPR32:$rs1), (FCVT_H_S FPR32:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR16:$rs1), (FCVT_S_H FPR16:$rs1)>; // Moves (no conversion) def : Pat<(riscv_fmv_h_x GPR:$src), (FMV_H_X GPR:$src)>; @@ -347,8 +287,8 @@ def : Pat<(riscv_fmv_x_anyexth FPR16:$src), (FMV_X_H FPR16:$src)>; let Predicates = [HasStdExtZfh, IsRV32] in { // half->[u]int. Round-to-zero must be used. -def : Pat<(i32 (fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(i32 (fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; +def : Pat<(i32 (any_fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>; // Saturating float->[u]int32. def : Pat<(i32 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>; @@ -361,20 +301,20 @@ def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>; def : Pat<(i32 (lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>; // [u]int->half. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i32 GPR:$rs1)), (FCVT_H_WU $rs1, 0b111)>; } // Predicates = [HasStdExtZfh, IsRV32] let Predicates = [HasStdExtZfh, IsRV64] in { // Use target specific isd nodes to help us remember the result is sign // extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be // duplicated if it has another user that didn't need the sign_extend. 
-def : Pat<(riscv_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>; -def : Pat<(riscv_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>; +def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>; // half->[u]int64. Round-to-zero must be used. -def : Pat<(i64 (fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; -def : Pat<(i64 (fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; +def : Pat<(i64 (any_fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>; // Saturating float->[u]int64. def : Pat<(i64 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>; @@ -389,17 +329,17 @@ def : Pat<(i64 (lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; def : Pat<(i64 (llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>; // [u]int->fp. Match GCC and default to using dynamic rounding mode. -def : Pat<(sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_H_WU $rs1, 0b111)>; -def : Pat<(sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>; -def : Pat<(uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_H_WU $rs1, 0b111)>; +def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L $rs1, 0b111)>; +def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU $rs1, 0b111)>; } // Predicates = [HasStdExtZfh, IsRV64] let Predicates = [HasStdExtZfhmin, HasStdExtD] in { /// Float conversion operations // f64 -> f16, f16 -> f64 -def : Pat<(fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>; -def : Pat<(fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>; +def : Pat<(any_fpround FPR64:$rs1), (FCVT_H_D FPR64:$rs1, 0b111)>; +def : Pat<(any_fpextend FPR16:$rs1), (FCVT_D_H FPR16:$rs1)>; /// Float arithmetic operations def : 
Pat<(fcopysign FPR16:$rs1, FPR64:$rs2), diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 798532d5bc44..9094dff1dda1 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -105,7 +105,6 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Floating point environment registers. markSuperRegs(Reserved, RISCV::FRM); markSuperRegs(Reserved, RISCV::FFLAGS); - markSuperRegs(Reserved, RISCV::FCSR); assert(checkAllSuperRegsMarked(Reserved)); return Reserved; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index a56f992d320e..20903b317180 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -550,16 +550,15 @@ def VRM8NoV0 : VReg<[vint8m8_t, vint16m8_t, vint32m8_t, vint64m8_t, vfloat16m8_t, vfloat32m8_t, vfloat64m8_t], (add V8M8, V16M8, V24M8), 8>; -defvar VMaskVTs = [vbool64_t, vbool32_t, vbool16_t, vbool8_t, - vbool4_t, vbool2_t, vbool1_t]; +defvar VMaskVTs = [vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t, + vbool32_t, vbool64_t]; def VMV0 : RegisterClass<"RISCV", VMaskVTs, 64, (add V0)> { let Size = 64; } // The register class is added for inline assembly for vector mask types. -def VM : VReg<[vbool1_t, vbool2_t, vbool4_t, vbool8_t, vbool16_t, - vbool32_t, vbool64_t], +def VM : VReg<VMaskVTs, (add (sequence "V%u", 8, 31), (sequence "V%u", 0, 7)), 1>; @@ -578,7 +577,6 @@ foreach m = LMULList.m in { // Special registers def FFLAGS : RISCVReg<0, "fflags">; def FRM : RISCVReg<0, "frm">; -def FCSR : RISCVReg<0, "fcsr">; // Any type register. Used for .insn directives when we don't know what the // register types could be. 
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index 14f59152ed42..d5a0932c8778 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -16,7 +16,8 @@ def RocketModel : SchedMachineModel { let IssueWidth = 1; // 1 micro-op is dispatched per cycle. let LoadLatency = 3; let MispredictPenalty = 3; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg]; + let CompleteModel = false; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 5b435fcb16a2..7f9d0aabc4ed 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -15,7 +15,7 @@ def SiFive7Model : SchedMachineModel { let LoadLatency = 3; let MispredictPenalty = 3; let CompleteModel = 0; - let UnsupportedFeatures = [HasStdExtV, HasStdExtZvamo, HasStdExtZvlsseg]; + let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg]; } // The SiFive7 microarchitecture has two pipelines: A and B. 
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index deb2a11f98f1..d0330e6984a5 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -51,7 +51,6 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool HasStdExtZbt = false; bool HasStdExtV = false; bool HasStdExtZvlsseg = false; - bool HasStdExtZvamo = false; bool HasStdExtZfhmin = false; bool HasStdExtZfh = false; bool HasRV64 = false; @@ -118,7 +117,6 @@ public: bool hasStdExtZbt() const { return HasStdExtZbt; } bool hasStdExtV() const { return HasStdExtV; } bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; } - bool hasStdExtZvamo() const { return HasStdExtZvamo; } bool hasStdExtZfhmin() const { return HasStdExtZfhmin; } bool hasStdExtZfh() const { return HasStdExtZfh; } bool is64Bit() const { return HasRV64; } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 56f0952fafc9..c435430a1288 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -162,3 +162,94 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0, CostKind, I); return NumLoads * MemOpCost; } + +void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP, + OptimizationRemarkEmitter *ORE) { + // TODO: More tuning on benchmarks and metrics with changes as needed + // would apply to all settings below to enable performance. 
+ + // Support explicit targets enabled for SiFive with the unrolling preferences + // below + bool UseDefaultPreferences = true; + if (ST->getTuneCPU().contains("sifive-e76") || + ST->getTuneCPU().contains("sifive-s76") || + ST->getTuneCPU().contains("sifive-u74") || + ST->getTuneCPU().contains("sifive-7")) + UseDefaultPreferences = false; + + if (UseDefaultPreferences) + return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE); + + // Enable Upper bound unrolling universally, not dependant upon the conditions + // below. + UP.UpperBound = true; + + // Disable loop unrolling for Oz and Os. + UP.OptSizeThreshold = 0; + UP.PartialOptSizeThreshold = 0; + if (L->getHeader()->getParent()->hasOptSize()) + return; + + SmallVector<BasicBlock *, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + LLVM_DEBUG(dbgs() << "Loop has:\n" + << "Blocks: " << L->getNumBlocks() << "\n" + << "Exit blocks: " << ExitingBlocks.size() << "\n"); + + // Only allow another exit other than the latch. This acts as an early exit + // as it mirrors the profitability calculation of the runtime unroller. + if (ExitingBlocks.size() > 2) + return; + + // Limit the CFG of the loop body for targets with a branch predictor. + // Allowing 4 blocks permits if-then-else diamonds in the body. + if (L->getNumBlocks() > 4) + return; + + // Don't unroll vectorized loops, including the remainder loop + if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) + return; + + // Scan the loop: don't unroll loops with calls as this could prevent + // inlining. + InstructionCost Cost = 0; + for (auto *BB : L->getBlocks()) { + for (auto &I : *BB) { + // Initial setting - Don't unroll loops containing vectorized + // instructions. 
+ if (I.getType()->isVectorTy()) + return; + + if (isa<CallInst>(I) || isa<InvokeInst>(I)) { + if (const Function *F = cast<CallBase>(I).getCalledFunction()) { + if (!isLoweredToCall(F)) + continue; + } + return; + } + + SmallVector<const Value *> Operands(I.operand_values()); + Cost += + getUserCost(&I, Operands, TargetTransformInfo::TCK_SizeAndLatency); + } + } + + LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n"); + + UP.Partial = true; + UP.Runtime = true; + UP.UnrollRemainder = true; + UP.UnrollAndJam = true; + UP.UnrollAndJamInnerLoopThreshold = 60; + + // Force unrolling small loops can be very useful because of the branch + // taken cost of the backedge. + if (Cost < 12) + UP.Force = true; +} + +void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP) { + BaseT::getPeelingPreferences(L, SE, PP); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 675681616d6e..7353496f4684 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -73,6 +73,13 @@ public: llvm_unreachable("Unsupported register kind"); } + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP, + OptimizationRemarkEmitter *ORE); + + void getPeelingPreferences(Loop *L, ScalarEvolution &SE, + TTI::PeelingPreferences &PP); + unsigned getMinVectorRegisterBitWidth() const { return ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0; } @@ -178,7 +185,9 @@ public: } unsigned getMaxInterleaveFactor(unsigned VF) { - return ST->getMaxInterleaveFactor(); + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll to unroll the loop. + return VF == 1 ? 
1 : ST->getMaxInterleaveFactor(); } }; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 0f5e0b9672a9..538380263c3c 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -28,25 +28,43 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value, if (Kind < FirstTargetFixupKind) return Value; + auto checkFixupInRange = [&](int64_t Min, int64_t Max) -> bool { + int64_t SVal = int64_t(Value); + if (SVal < Min || SVal > Max) { + Ctx.reportError(Fixup.getLoc(), "operand out of range (" + Twine(SVal) + + " not between " + Twine(Min) + + " and " + Twine(Max) + ")"); + return false; + } + return true; + }; + + auto handlePCRelFixupValue = [&](unsigned W) -> uint64_t { + if (Value % 2 != 0) + Ctx.reportError(Fixup.getLoc(), "Non-even PC relative offset."); + if (!checkFixupInRange(minIntN(W) * 2, maxIntN(W) * 2)) + return 0; + return (int64_t)Value / 2; + }; + switch (unsigned(Kind)) { case SystemZ::FK_390_PC12DBL: + return handlePCRelFixupValue(12); case SystemZ::FK_390_PC16DBL: + return handlePCRelFixupValue(16); case SystemZ::FK_390_PC24DBL: + return handlePCRelFixupValue(24); case SystemZ::FK_390_PC32DBL: - return (int64_t)Value / 2; + return handlePCRelFixupValue(32); case SystemZ::FK_390_12: - if (!isUInt<12>(Value)) { - Ctx.reportError(Fixup.getLoc(), "displacement exceeds uint12"); + if (!checkFixupInRange(0, maxUIntN(12))) return 0; - } return Value; case SystemZ::FK_390_20: { - if (!isInt<20>(Value)) { - Ctx.reportError(Fixup.getLoc(), "displacement exceeds int20"); + if (!checkFixupInRange(minIntN(20), maxIntN(20))) return 0; - } // The high byte of a 20 bit displacement value comes first. 
uint64_t DLo = Value & 0xfff; uint64_t DHi = (Value >> 12) & 0xff; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index e280e4aaf3d8..c83796b8579b 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -197,7 +197,8 @@ getDispOpValue(const MCInst &MI, unsigned OpNum, // All instructions follow the pattern where the first displacement has a // 2 bytes offset, and the second one 4 bytes. unsigned ByteOffs = Fixups.size() == 0 ? 2 : 4; - Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind)); + Fixups.push_back(MCFixup::create(ByteOffs, MO.getExpr(), (MCFixupKind)Kind, + MI.getLoc())); assert(Fixups.size() <= 2 && "More than two memory operands in MI?"); return 0; } @@ -296,6 +297,7 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl<MCFixup> &Fixups, unsigned Kind, int64_t Offset, bool AllowTLS) const { + SMLoc Loc = MI.getLoc(); const MCOperand &MO = MI.getOperand(OpNum); const MCExpr *Expr; if (MO.isImm()) @@ -311,13 +313,13 @@ SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum, Expr = MCBinaryExpr::createAdd(Expr, OffsetExpr, Ctx); } } - Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind)); + Fixups.push_back(MCFixup::create(Offset, Expr, (MCFixupKind)Kind, Loc)); // Output the fixup for the TLS marker if present. 
if (AllowTLS && OpNum + 1 < MI.getNumOperands()) { const MCOperand &MOTLS = MI.getOperand(OpNum + 1); - Fixups.push_back(MCFixup::create(0, MOTLS.getExpr(), - (MCFixupKind)SystemZ::FK_390_TLS_CALL)); + Fixups.push_back(MCFixup::create( + 0, MOTLS.getExpr(), (MCFixupKind)SystemZ::FK_390_TLS_CALL, Loc)); } return 0; } diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 373023effb4a..a7ea5e1e4bf8 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -166,6 +166,7 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>; // any non-leaf function and restored in the epilogue for use by the // return instruction so it functions exactly like a callee-saved register. def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15), + (sequence "R%dD", 4, 4), (sequence "F%dD", 15, 8))>; def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64, diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 2f7cdfcf7bde..99ab4c5455d6 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -818,7 +818,7 @@ bool SystemZELFFrameLowering::usePackedStack(MachineFunction &MF) const { } SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering() - : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128, + : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(32), 0, Align(32), /* StackRealignable */ false), RegSpillOffsets(-1) { @@ -990,12 +990,184 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters( return true; } +bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const { + + if (CSI.empty()) + return false; + + MachineFunction &MF = 
*MBB.getParent(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Restore FPRs in the normal TargetInstrInfo way. + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (SystemZ::FP64BitRegClass.contains(Reg)) + TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + &SystemZ::FP64BitRegClass, TRI); + if (SystemZ::VR128BitRegClass.contains(Reg)) + TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(), + &SystemZ::VR128BitRegClass, TRI); + } + + // Restore call-saved GPRs (but not call-clobbered varargs, which at + // this point might hold return values). + SystemZ::GPRRegs RestoreGPRs = ZFI->getRestoreGPRRegs(); + if (RestoreGPRs.LowGPR) { + assert(isInt<20>(Regs.getStackPointerBias() + RestoreGPRs.GPROffset)); + if (RestoreGPRs.LowGPR == RestoreGPRs.HighGPR) + // Build an LG/L instruction. + BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LG), RestoreGPRs.LowGPR) + .addReg(Regs.getStackPointerRegister()) + .addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset) + .addReg(0); + else { + // Build an LMG/LM instruction. + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG)); + + // Add the explicit register operands. + MIB.addReg(RestoreGPRs.LowGPR, RegState::Define); + MIB.addReg(RestoreGPRs.HighGPR, RegState::Define); + + // Add the address. 
+ MIB.addReg(Regs.getStackPointerRegister()); + MIB.addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset); + + // Do a second scan adding regs as being defined by instruction + for (unsigned I = 0, E = CSI.size(); I != E; ++I) { + unsigned Reg = CSI[I].getReg(); + if (Reg > RestoreGPRs.LowGPR && Reg < RestoreGPRs.HighGPR) + MIB.addReg(Reg, RegState::ImplicitDefine); + } + } + } + + return true; +} + void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} + MachineBasicBlock &MBB) const { + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + MachineInstr *StoreInstr = nullptr; + bool HasFP = hasFP(MF); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; + uint64_t Offset = 0; + + // TODO: Support leaf functions; only add size of save+reserved area when + // function is non-leaf. + MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize()); + uint64_t StackSize = MFFrame.getStackSize(); + + // FIXME: Implement support for large stack sizes, when the stack extension + // routine needs to be called. + if (StackSize > 1024 * 1024) { + llvm_unreachable("Huge Stack Frame not yet supported on z/OS"); + } + + if (ZFI->getSpillGPRRegs().LowGPR) { + // Skip over the GPR saves. + if ((MBBI != MBB.end()) && ((MBBI->getOpcode() == SystemZ::STMG))) { + const int Operand = 3; + // Now we can set the offset for the operation, since now the Stack + // has been finalized. 
+ Offset = Regs.getStackPointerBias() + MBBI->getOperand(Operand).getImm(); + // Maximum displacement for STMG instruction. + if (isInt<20>(Offset - StackSize)) + Offset -= StackSize; + else + StoreInstr = &*MBBI; + MBBI->getOperand(Operand).setImm(Offset); + ++MBBI; + } else + llvm_unreachable("Couldn't skip over GPR saves"); + } + + if (StackSize) { + MachineBasicBlock::iterator InsertPt = StoreInstr ? StoreInstr : MBBI; + // Allocate StackSize bytes. + int64_t Delta = -int64_t(StackSize); + + // In case the STM(G) instruction also stores SP (R4), but the displacement + // is too large, the SP register is manipulated first before storing, + // resulting in the wrong value stored and retrieved later. In this case, we + // need to temporarily save the value of SP, and store it later to memory. + if (StoreInstr && HasFP) { + // Insert LR r0,r4 before STMG instruction. + BuildMI(MBB, InsertPt, DL, ZII->get(SystemZ::LGR)) + .addReg(SystemZ::R0D, RegState::Define) + .addReg(SystemZ::R4D); + // Insert ST r0,xxx(,r4) after STMG instruction. + BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG)) + .addReg(SystemZ::R0D, RegState::Kill) + .addReg(SystemZ::R4D) + .addImm(Offset) + .addReg(0); + } + + emitIncrement(MBB, InsertPt, DL, Regs.getStackPointerRegister(), Delta, + ZII); + } + + if (HasFP) { + // Copy the base of the frame to Frame Pointer Register. + BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR), + Regs.getFramePointerRegister()) + .addReg(Regs.getStackPointerRegister()); + + // Mark the FramePtr as live at the beginning of every block except + // the entry block. (We'll have marked R8 as live on entry when + // saving the GPRs.) 
+ for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I) + I->addLiveIn(Regs.getFramePointerRegister()); + } +} void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const {} + MachineBasicBlock &MBB) const { + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + // Skip the return instruction. + assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks"); + + uint64_t StackSize = MFFrame.getStackSize(); + if (StackSize) { + unsigned SPReg = Regs.getStackPointerRegister(); + if (ZFI->getRestoreGPRRegs().LowGPR != SPReg) { + DebugLoc DL = MBBI->getDebugLoc(); + emitIncrement(MBB, MBBI, DL, SPReg, StackSize, ZII); + } + } +} bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { - return false; + return (MF.getFrameInfo().hasVarSizedObjects()); +} + +void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFFrame = MF.getFrameInfo(); + const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); + auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + + // Setup stack frame offset + MFFrame.setOffsetAdjustment(Regs.getStackPointerBias()); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index af219da79c32..106b9e8ebe06 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -115,11 +115,20 @@ public: ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const override; + bool + 
restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBII, + MutableArrayRef<CalleeSavedInfo> CSI, + const TargetRegisterInfo *TRI) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 71432218068e..24de52850771 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1500,8 +1500,16 @@ SDValue SystemZTargetLowering::LowerFormalArguments( assert(VA.isMemLoc() && "Argument not register or memory"); // Create the frame index object for this incoming parameter. - int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, - VA.getLocMemOffset(), true); + // FIXME: Pre-include call frame size in the offset, should not + // need to manually add it here. + int64_t ArgSPOffset = VA.getLocMemOffset(); + if (Subtarget.isTargetXPLINK64()) { + auto &XPRegs = + Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); + ArgSPOffset += XPRegs.getCallFrameSize(); + } + int FI = + MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, ArgSPOffset, true); // Create the SelectionDAG nodes corresponding to a load // from this parameter. 
Unpromoted ints and floats are @@ -5714,6 +5722,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(OC); OPCODE(XC); OPCODE(CLC); + OPCODE(MEMSET_MVC); OPCODE(STPCPY); OPCODE(STRCMP); OPCODE(SEARCH_STRING); @@ -7860,8 +7869,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, return MBB; } -MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( - MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { +MachineBasicBlock * +SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned Opcode, bool IsMemset) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); @@ -7870,18 +7881,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( MachineOperand DestBase = earlyUseOperand(MI.getOperand(0)); uint64_t DestDisp = MI.getOperand(1).getImm(); - MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); - uint64_t SrcDisp = MI.getOperand(3).getImm(); - MachineOperand &LengthMO = MI.getOperand(4); + MachineOperand SrcBase = MachineOperand::CreateReg(0U, false); + uint64_t SrcDisp; + + // Fold the displacement Disp if it is out of range. + auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void { + if (!isUInt<12>(Disp)) { + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg) + .add(Base).addImm(Disp).addReg(0); + Base = MachineOperand::CreateReg(Reg, false); + Disp = 0; + } + }; + + if (!IsMemset) { + SrcBase = earlyUseOperand(MI.getOperand(2)); + SrcDisp = MI.getOperand(3).getImm(); + } else { + SrcBase = DestBase; + SrcDisp = DestDisp++; + foldDisplIfNeeded(DestBase, DestDisp); + } + + MachineOperand &LengthMO = MI.getOperand(IsMemset ? 
2 : 4); bool IsImmForm = LengthMO.isImm(); bool IsRegForm = !IsImmForm; + // Build and insert one Opcode of Length, with special treatment for memset. + auto insertMemMemOp = [&](MachineBasicBlock *InsMBB, + MachineBasicBlock::iterator InsPos, + MachineOperand DBase, uint64_t DDisp, + MachineOperand SBase, uint64_t SDisp, + unsigned Length) -> void { + assert(Length > 0 && Length <= 256 && "Building memory op with bad length."); + if (IsMemset) { + MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3)); + if (ByteMO.isImm()) + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI)) + .add(SBase).addImm(SDisp).add(ByteMO); + else + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC)) + .add(ByteMO).add(SBase).addImm(SDisp).addReg(0); + if (--Length == 0) + return; + } + BuildMI(*MBB, InsPos, DL, TII->get(Opcode)) + .add(DBase).addImm(DDisp).addImm(Length) + .add(SBase).addImm(SDisp) + .setMemRefs(MI.memoperands()); + }; + bool NeedsLoop = false; uint64_t ImmLength = 0; - Register LenMinus1Reg = SystemZ::NoRegister; + Register LenAdjReg = SystemZ::NoRegister; if (IsImmForm) { ImmLength = LengthMO.getImm(); - ImmLength++; // Add back the '1' subtracted originally. + ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment. 
if (ImmLength == 0) { MI.eraseFromParent(); return MBB; @@ -7905,7 +7962,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( NeedsLoop = true; } else { NeedsLoop = true; - LenMinus1Reg = LengthMO.getReg(); + LenAdjReg = LengthMO.getReg(); } // When generating more than one CLC, all but the last will need to @@ -7923,17 +7980,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( ImmLength &= 255; } else { BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(0) .addImm(8); } + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); auto loadZeroAddress = [&]() -> MachineOperand { Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0); return MachineOperand::CreateReg(Reg, false); }; - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister) DestBase = loadZeroAddress(); if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister) @@ -7968,14 +8025,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( DoneMBB = SystemZ::emitBlockAfter(NextMBB); // MBB: - // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. + // # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB. BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) - .addReg(LenMinus1Reg).addImm(-1); + .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) .addMBB(AllDoneMBB); MBB->addSuccessor(AllDoneMBB); - MBB->addSuccessor(StartMBB); + if (!IsMemset) + MBB->addSuccessor(StartMBB); + else { + // MemsetOneCheckMBB: + // # Jump to MemsetOneMBB for a memset of length 1, or + // # fall thru to StartMBB. 
+ MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB); + MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin()); + MBB->addSuccessor(MemsetOneCheckMBB); + MBB = MemsetOneCheckMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenAdjReg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(MemsetOneMBB); + MBB->addSuccessor(MemsetOneMBB, {10, 100}); + MBB->addSuccessor(StartMBB, {90, 100}); + + // MemsetOneMBB: + // # Jump back to AllDoneMBB after a single MVI or STC. + MBB = MemsetOneMBB; + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(StartDestReg, false), DestDisp, + MachineOperand::CreateReg(StartSrcReg, false), SrcDisp, + 1); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + } // StartMBB: // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. @@ -8032,10 +8116,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( if (Opcode == SystemZ::MVC) BuildMI(MBB, DL, TII->get(SystemZ::PFD)) .addImm(SystemZ::PFD_WRITE) - .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) - .addReg(ThisDestReg).addImm(DestDisp).addImm(256) - .addReg(ThisSrcReg).addImm(SrcDisp); + .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0); + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(ThisDestReg, false), DestDisp, + MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256); if (EndMBB) { BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8075,7 +8159,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. // # Use EXecute Relative Long for the remainder of the bytes. The target // instruction of the EXRL will have a length field of 1 since 0 is an - // illegal value. 
The number of bytes processed becomes (%LenMinus1Reg & + // illegal value. The number of bytes processed becomes (%LenAdjReg & // 0xff) + 1. // # Fall through to AllDoneMBB. Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); @@ -8088,10 +8172,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(NextMBB); + if (IsMemset) + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(RemDestReg, false), DestDisp, + MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1); MachineInstrBuilder EXRL_MIB = BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) .addImm(Opcode) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(RemDestReg).addImm(DestDisp) .addReg(RemSrcReg).addImm(SrcDisp); MBB->addSuccessor(AllDoneMBB); @@ -8107,32 +8195,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( while (ImmLength > 0) { uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. - // Apply them using LAY if so. - if (!isUInt<12>(DestDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(DestBase) - .addImm(DestDisp) - .addReg(0); - DestBase = MachineOperand::CreateReg(Reg, false); - DestDisp = 0; - } - if (!isUInt<12>(SrcDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(SrcBase) - .addImm(SrcDisp) - .addReg(0); - SrcBase = MachineOperand::CreateReg(Reg, false); - SrcDisp = 0; - } - BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .add(DestBase) - .addImm(DestDisp) - .addImm(ThisLength) - .add(SrcBase) - .addImm(SrcDisp) - .setMemRefs(MI.memoperands()); + // Apply them using LA/LAY if so. 
+ foldDisplIfNeeded(DestBase, DestDisp); + foldDisplIfNeeded(SrcBase, SrcDisp); + insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength); DestDisp += ThisLength; SrcDisp += ThisLength; ImmLength -= ThisLength; @@ -8630,6 +8696,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::CLCImm: case SystemZ::CLCReg: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); + case SystemZ::MemsetImmImm: + case SystemZ::MemsetImmReg: + case SystemZ::MemsetRegImm: + case SystemZ::MemsetRegReg: + return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); case SystemZ::MVSTLoop: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 461f804ca55e..940c0a857ea4 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -126,6 +126,9 @@ enum NodeType : unsigned { // as for MVC. CLC, + // Use MVC to set a block of memory after storing the first byte. + MEMSET_MVC, + // Use an MVST-based sequence to implement stpcpy(). 
STPCPY, @@ -709,7 +712,8 @@ private: MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB, - unsigned Opcode) const; + unsigned Opcode, + bool IsMemset = false) const; MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; MachineBasicBlock *emitTransactionBegin(MachineInstr &MI, diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index cd60fff1ab11..e513befd0d6f 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2> let Constraints = "$R1 = $R1src"; } +class MemsetPseudo<DAGOperand lenop, DAGOperand byteop> + : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B), + [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> { + let Defs = [CC]; + let mayLoad = 1; + let mayStore = 1; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; +} + //===----------------------------------------------------------------------===// // Multiclasses that emit both real and pseudo instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index e4760229fd6b..84f1e0fb428c 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in { def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; } +// Memset[Length][Byte] pseudos. 
+def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>; +def MemsetImmReg : MemsetPseudo<imm64, GR32>; +def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>; +def MemsetRegReg : MemsetPseudo<ADDR64, GR32>; + // Move right. let Predicates = [FeatureMiscellaneousExtensions3], mayLoad = 1, mayStore = 1, Uses = [R0L] in diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 927d97233286..9935416559bc 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVT<3, i64>]>; +def SDT_ZMemsetMVC : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, + SDTCisVT<1, i64>, + SDTCisVT<2, i32>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; +def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index f38e93109967..db4b4879b33a 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,29 +17,44 @@ using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) { - return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) - : DAG.getVTList(MVT::Other); +static unsigned getMemMemLenAdj(unsigned Op) { + return Op == SystemZISD::MEMSET_MVC ? 
2 : 1; } -// Emit a mem-mem operation after subtracting one from size, which will be -// added back during pseudo expansion. As the Reg case emitted here may be -// converted by DAGCombiner into having an Imm length, they are both emitted -// the same way. +static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue LenAdj, SDValue Byte) { + SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) + : DAG.getVTList(MVT::Other); + SmallVector<SDValue, 6> Ops; + if (Op == SystemZISD::MEMSET_MVC) + Ops = { Chain, Dst, LenAdj, Byte }; + else + Ops = { Chain, Dst, Src, LenAdj }; + return DAG.getNode(Op, DL, VTs, Ops); +} + +// Emit a mem-mem operation after subtracting one (or two for memset) from +// size, which will be added back during pseudo expansion. As the Reg case +// emitted here may be converted by DAGCombiner into having an Imm length, +// they are both emitted the same way. static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size) { - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, - DAG.getConstant(Size - 1, DL, Src.getValueType())); + uint64_t Size, SDValue Byte = SDValue()) { + unsigned Adj = getMemMemLenAdj(Op); + assert(Size >= Adj && "Adjusted length overflow."); + SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType()); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size) { - SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, - DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(-1, DL, MVT::i64)); - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1); + SDValue Size, SDValue Byte = SDValue()) { + int64_t Adj = getMemMemLenAdj(Op); + SDValue LenAdj = 
DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(0 - Adj, DL, MVT::i64)); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset( if (CByte && CByte->getZExtValue() == 0) return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes); - // Copy the byte to the first location and then use MVC to copy - // it to the rest. - Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment); - SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, - DAG.getConstant(1, DL, PtrVT)); - return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst, - Bytes - 1); + return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Variable length @@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset( // Handle the special case of a variable length memset of 0 with XC. 
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size); - return SDValue(); + return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Convert the current CC value into an integer that is 0 if CC == 0, diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp index 7e92e4b33812..fd9dc32b04f5 100644 --- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp +++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp @@ -84,6 +84,8 @@ class VEAsmParser : public MCTargetAsmParser { StringRef splitMnemonic(StringRef Name, SMLoc NameLoc, OperandVector *Operands); + bool parseLiteralValues(unsigned Size, SMLoc L); + public: VEAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) @@ -994,10 +996,43 @@ bool VEAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } bool VEAsmParser::ParseDirective(AsmToken DirectiveID) { + std::string IDVal = DirectiveID.getIdentifier().lower(); + + // Defines VE specific directives. Reference is "Vector Engine Assembly + // Language Reference Manual": + // https://www.hpc.nec/documents/sdk/pdfs/VectorEngine-as-manual-v1.3.pdf + + // The .word is 4 bytes long on VE. + if (IDVal == ".word") + return parseLiteralValues(4, DirectiveID.getLoc()); + + // The .long is 8 bytes long on VE. + if (IDVal == ".long") + return parseLiteralValues(8, DirectiveID.getLoc()); + + // The .llong is 8 bytes long on VE. + if (IDVal == ".llong") + return parseLiteralValues(8, DirectiveID.getLoc()); + // Let the MC layer to handle other directives. 
return true; } +/// parseLiteralValues +/// ::= .word expression [, expression]* +/// ::= .long expression [, expression]* +/// ::= .llong expression [, expression]* +bool VEAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { + auto parseOne = [&]() -> bool { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + getParser().getStreamer().emitValue(Value, Size, L); + return false; + }; + return (parseMany(parseOne)); +} + /// Extract \code @lo32/@hi32/etc \endcode modifier from expression. /// Recursively scan the expression and check for VK_VE_HI32/LO32/etc /// symbol variants. If all symbols with modifier use the same diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index 29c209934680..38d163b37080 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -42,6 +42,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case VE::fixup_ve_tpoff_hi32: return (Value >> 32) & 0xffffffff; case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: case VE::fixup_ve_lo32: case VE::fixup_ve_pc_lo32: case VE::fixup_ve_got_lo32: @@ -68,6 +69,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_4: case FK_PCRel_4: case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: case VE::fixup_ve_hi32: case VE::fixup_ve_lo32: case VE::fixup_ve_pc_hi32: @@ -103,6 +105,7 @@ public: const static MCFixupKindInfo Infos[VE::NumTargetFixupKinds] = { // name, offset, bits, flags {"fixup_ve_reflong", 0, 32, 0}, + {"fixup_ve_srel32", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"fixup_ve_hi32", 0, 32, 0}, {"fixup_ve_lo32", 0, 32, 0}, {"fixup_ve_pc_hi32", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 741e8320a941..ae065407409a 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ 
b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -9,6 +9,7 @@ #include "VEFixupKinds.h" #include "VEMCExpr.h" #include "VEMCTargetDesc.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" @@ -46,16 +47,29 @@ unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, if (IsPCRel) { switch (Fixup.getTargetKind()) { default: - llvm_unreachable("Unimplemented fixup -> relocation"); + Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); + return ELF::R_VE_NONE; + case FK_Data_1: case FK_PCRel_1: - llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "1-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case FK_Data_2: case FK_PCRel_2: - llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation"); - // FIXME: relative kind? + Ctx.reportError(Fixup.getLoc(), + "2-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case FK_Data_4: case FK_PCRel_4: - return ELF::R_VE_REFLONG; + return ELF::R_VE_SREL32; + case FK_Data_8: case FK_PCRel_8: - return ELF::R_VE_REFQUAD; + Ctx.reportError(Fixup.getLoc(), + "8-byte pc-relative data relocation is not supported"); + return ELF::R_VE_NONE; + case VE::fixup_ve_reflong: + case VE::fixup_ve_srel32: + return ELF::R_VE_SREL32; case VE::fixup_ve_pc_hi32: return ELF::R_VE_PC_HI32; case VE::fixup_ve_pc_lo32: @@ -65,25 +79,36 @@ unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, switch (Fixup.getTargetKind()) { default: - llvm_unreachable("Unimplemented fixup -> relocation"); + Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type"); + return ELF::R_VE_NONE; case FK_Data_1: - llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation"); + Ctx.reportError(Fixup.getLoc(), "1-byte data relocation is not supported"); + return ELF::R_VE_NONE; case FK_Data_2: - 
llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation"); + Ctx.reportError(Fixup.getLoc(), "2-byte data relocation is not supported"); + return ELF::R_VE_NONE; case FK_Data_4: return ELF::R_VE_REFLONG; case FK_Data_8: return ELF::R_VE_REFQUAD; case VE::fixup_ve_reflong: return ELF::R_VE_REFLONG; + case VE::fixup_ve_srel32: + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative srel32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_hi32: return ELF::R_VE_HI32; case VE::fixup_ve_lo32: return ELF::R_VE_LO32; case VE::fixup_ve_pc_hi32: - llvm_unreachable("Unimplemented fixup pc_hi32 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative pc_hi32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_pc_lo32: - llvm_unreachable("Unimplemented fixup pc_lo32 -> relocation"); + Ctx.reportError(Fixup.getLoc(), + "A non pc-relative pc_lo32 relocation is not supported"); + return ELF::R_VE_NONE; case VE::fixup_ve_got_hi32: return ELF::R_VE_GOT_HI32; case VE::fixup_ve_got_lo32: diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h index 5d5dc1c5c891..46b995cee840 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h +++ b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h @@ -17,6 +17,9 @@ enum Fixups { /// fixup_ve_reflong - 32-bit fixup corresponding to foo fixup_ve_reflong = FirstTargetFixupKind, + /// fixup_ve_srel32 - 32-bit fixup corresponding to foo for relative branch + fixup_ve_srel32, + /// fixup_ve_hi32 - 32-bit fixup corresponding to foo@hi fixup_ve_hi32, diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index d50d8fcae9da..65bb0cf8b0d7 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -102,11 +102,11 @@ unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCSubtargetInfo &STI) const 
{ if (MO.isReg()) return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - if (MO.isImm()) - return MO.getImm(); + return static_cast<unsigned>(MO.getImm()); assert(MO.isExpr()); + const MCExpr *Expr = MO.getExpr(); if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) { MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind(); @@ -131,7 +131,7 @@ VEMCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); Fixups.push_back( - MCFixup::create(0, MO.getExpr(), (MCFixupKind)VE::fixup_ve_pc_lo32)); + MCFixup::create(0, MO.getExpr(), (MCFixupKind)VE::fixup_ve_srel32)); return 0; } diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp index a3ce3b3309be..4d45918ad0aa 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp @@ -12,11 +12,12 @@ //===----------------------------------------------------------------------===// #include "VEMCExpr.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCValue.h" using namespace llvm; @@ -174,7 +175,13 @@ VE::Fixups VEMCExpr::getFixupKind(VEMCExpr::VariantKind Kind) { bool VEMCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const { - return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup); + if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; } static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index 32315543826a..5ef223d6030b 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ 
b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -1720,7 +1720,7 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); -#define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME: +#define ADD_BINARY_VVP_OP(VVP_NAME, VP_NAME, ISD_NAME) case ISD::ISD_NAME: #include "VVPNodes.def" return lowerToVVP(Op, DAG); } diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td index 2c88d5099a7b..99566e91ec11 100644 --- a/llvm/lib/Target/VE/VVPInstrInfo.td +++ b/llvm/lib/Target/VE/VVPInstrInfo.td @@ -29,6 +29,16 @@ def SDTIntBinOpVVP : SDTypeProfile<1, 4, [ // vp_add, vp_and, etc. IsVLVT<4> ]>; +// BinaryFPOp(x,y,mask,vl) +def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc. + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisFP<0>, + SDTCisInt<3>, + SDTCisSameNumEltsAs<0, 3>, + IsVLVT<4> +]>; + // Binary operator commutative pattern. class vvp_commutative<SDNode RootOp> : PatFrags< @@ -40,7 +50,32 @@ class vvp_commutative<SDNode RootOp> : def vvp_add : SDNode<"VEISD::VVP_ADD", SDTIntBinOpVVP>; def c_vvp_add : vvp_commutative<vvp_add>; +def vvp_sub : SDNode<"VEISD::VVP_SUB", SDTIntBinOpVVP>; + +def vvp_mul : SDNode<"VEISD::VVP_MUL", SDTIntBinOpVVP>; +def c_vvp_mul : vvp_commutative<vvp_mul>; + +def vvp_sdiv : SDNode<"VEISD::VVP_SDIV", SDTIntBinOpVVP>; +def vvp_udiv : SDNode<"VEISD::VVP_UDIV", SDTIntBinOpVVP>; + def vvp_and : SDNode<"VEISD::VVP_AND", SDTIntBinOpVVP>; def c_vvp_and : vvp_commutative<vvp_and>; +def vvp_or : SDNode<"VEISD::VVP_OR", SDTIntBinOpVVP>; +def c_vvp_or : vvp_commutative<vvp_or>; + +def vvp_xor : SDNode<"VEISD::VVP_XOR", SDTIntBinOpVVP>; +def c_vvp_xor : vvp_commutative<vvp_xor>; + +def vvp_srl : SDNode<"VEISD::VVP_SRL", SDTIntBinOpVVP>; +def vvp_sra : SDNode<"VEISD::VVP_SRA", SDTIntBinOpVVP>; +def vvp_shl : SDNode<"VEISD::VVP_SHL", SDTIntBinOpVVP>; + +def vvp_fadd : SDNode<"VEISD::VVP_FADD", SDTFPBinOpVVP>; +def c_vvp_fadd 
: vvp_commutative<vvp_fadd>; +def vvp_fsub : SDNode<"VEISD::VVP_FSUB", SDTFPBinOpVVP>; +def vvp_fmul : SDNode<"VEISD::VVP_FMUL", SDTFPBinOpVVP>; +def c_vvp_fmul : vvp_commutative<vvp_fmul>; +def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>; + // } Binary Operators diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td index ac03e0bf627e..8d5d9d103547 100644 --- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td +++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td @@ -17,54 +17,177 @@ //===----------------------------------------------------------------------===// include "VVPInstrInfo.td" -multiclass VectorBinaryArith< - SDPatternOperator OpNode, - ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, - string OpBaseName> { - // No mask. +multiclass Binary_rv<SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select, broadcast. + // TODO + + // Unmasked, broadcast. def : Pat<(OpNode - (any_broadcast ScalarVT:$sx), - DataVT:$vy, (MaskVT true_mask), i32:$avl), + (any_broadcast ScalarVT:$sx), DataVT:$vy, + (MaskVT true_mask), + i32:$avl), (!cast<Instruction>(OpBaseName#"rvl") ScalarVT:$sx, $vy, $avl)>; - def : Pat<(OpNode DataVT:$vx, DataVT:$vy, (MaskVT true_mask), i32:$avl), + // Masked, broadcast. + def : Pat<(OpNode + (any_broadcast ScalarVT:$sx), DataVT:$vy, + MaskVT:$mask, + i32:$avl), + (!cast<Instruction>(OpBaseName#"rvml") + ScalarVT:$sx, $vy, $mask, $avl)>; +} + +multiclass Binary_vr<SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select, broadcast. + // TODO + + // Unmasked, broadcast. + def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), + (MaskVT true_mask), + i32:$avl), + (!cast<Instruction>(OpBaseName#"vrl") + $vx, ScalarVT:$sy, $avl)>; + // Masked, broadcast. 
+ def : Pat<(OpNode + DataVT:$vx, (any_broadcast ScalarVT:$sy), + MaskVT:$mask, + i32:$avl), + (!cast<Instruction>(OpBaseName#"vrml") + $vx, ScalarVT:$sy, $mask, $avl)>; +} + +multiclass Binary_vv<SDPatternOperator OpNode, + ValueType DataVT, + ValueType MaskVT, string OpBaseName> { + // Masked with select. + // TODO + + // Unmasked. + def : Pat<(OpNode + DataVT:$vx, DataVT:$vy, + (MaskVT true_mask), + i32:$avl), (!cast<Instruction>(OpBaseName#"vvl") $vx, $vy, $avl)>; - // Mask. + // Masked. def : Pat<(OpNode - (any_broadcast ScalarVT:$sx), - DataVT:$vy, MaskVT:$mask, i32:$avl), - (!cast<Instruction>(OpBaseName#"rvml") - ScalarVT:$sx, $vy, $mask, $avl)>; - def : Pat<(OpNode DataVT:$vx, DataVT:$vy, MaskVT:$mask, i32:$avl), + DataVT:$vx, DataVT:$vy, + MaskVT:$mask, + i32:$avl), (!cast<Instruction>(OpBaseName#"vvml") $vx, $vy, $mask, $avl)>; +} - // TODO We do not specify patterns for the immediate variants here. There - // will be an immediate folding pass that takes care of switching to the - // immediate variant where applicable. +multiclass Binary_rv_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>; +} + +multiclass Binary_vr_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_vr<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>; +} - // TODO Fold vvp_select into passthru. 
+multiclass Binary_rv_vr_vv< + SDPatternOperator OpNode, + ValueType ScalarVT, ValueType DataVT, ValueType MaskVT, + string OpBaseName> { + defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; + defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>; } // Expand both 64bit and 32 bit variant (256 elements) -multiclass VectorBinaryArith_ShortLong< +multiclass Binary_rv_vv_ShortLong< + SDPatternOperator OpNode, + ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, + ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { + defm : Binary_rv_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_rv_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; +} + +multiclass Binary_vr_vv_ShortLong< + SDPatternOperator OpNode, + ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, + ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { + defm : Binary_vr_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_vr_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; +} + +multiclass Binary_rv_vr_vv_ShortLong< SDPatternOperator OpNode, ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName, ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> { - defm : VectorBinaryArith<OpNode, - LongScalarVT, LongDataVT, v256i1, - LongOpBaseName>; - defm : VectorBinaryArith<OpNode, - ShortScalarVT, ShortDataVT, v256i1, - ShortOpBaseName>; + defm : Binary_rv_vr_vv<OpNode, + LongScalarVT, LongDataVT, v256i1, + LongOpBaseName>; + defm : Binary_rv_vr_vv<OpNode, + ShortScalarVT, ShortDataVT, v256i1, + ShortOpBaseName>; } +defm : Binary_rv_vv_ShortLong<c_vvp_add, + i64, v256i64, "VADDSL", + i32, v256i32, "VADDSWSX">; +defm : Binary_rv_vv_ShortLong<vvp_sub, + i64, v256i64, "VSUBSL", + i32, v256i32, "VSUBSWSX">; +defm : Binary_rv_vv_ShortLong<c_vvp_mul, + i64, v256i64, "VMULSL", + i32, v256i32, 
"VMULSWSX">; +defm : Binary_rv_vr_vv_ShortLong<vvp_sdiv, + i64, v256i64, "VDIVSL", + i32, v256i32, "VDIVSWSX">; +defm : Binary_rv_vr_vv_ShortLong<vvp_udiv, + i64, v256i64, "VDIVUL", + i32, v256i32, "VDIVUW">; +defm : Binary_rv_vv_ShortLong<c_vvp_and, + i64, v256i64, "VAND", + i32, v256i32, "PVANDLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_or, + i64, v256i64, "VOR", + i32, v256i32, "PVORLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_xor, + i64, v256i64, "VXOR", + i32, v256i32, "PVXORLO">; +defm : Binary_vr_vv_ShortLong<vvp_shl, + i64, v256i64, "VSLL", + i32, v256i32, "PVSLLLO">; +defm : Binary_vr_vv_ShortLong<vvp_sra, + i64, v256i64, "VSRAL", + i32, v256i32, "PVSRALO">; +defm : Binary_vr_vv_ShortLong<vvp_srl, + i64, v256i64, "VSRL", + i32, v256i32, "PVSRLLO">; -defm : VectorBinaryArith_ShortLong<c_vvp_add, - i64, v256i64, "VADDSL", - i32, v256i32, "VADDSWSX">; -defm : VectorBinaryArith_ShortLong<c_vvp_and, - i64, v256i64, "VAND", - i32, v256i32, "PVANDLO">; +defm : Binary_rv_vv_ShortLong<c_vvp_fadd, + f64, v256f64, "VFADDD", + f32, v256f32, "PVFADDUP">; +defm : Binary_rv_vv_ShortLong<c_vvp_fmul, + f64, v256f64, "VFMULD", + f32, v256f32, "PVFMULUP">; +defm : Binary_rv_vv_ShortLong<vvp_fsub, + f64, v256f64, "VFSUBD", + f32, v256f32, "PVFSUBUP">; +defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv, + f64, v256f64, "VFDIVD", + f32, v256f32, "VFDIVS">; diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def index a68402e9ea10..8a9231f7d3e6 100644 --- a/llvm/lib/Target/VE/VVPNodes.def +++ b/llvm/lib/Target/VE/VVPNodes.def @@ -28,14 +28,38 @@ /// \p VVPName is a VVP Binary operator. /// \p SDNAME is the generic SD opcode corresponding to \p VVPName. 
#ifndef ADD_BINARY_VVP_OP -#define ADD_BINARY_VVP_OP(X,Y) ADD_VVP_OP(X,Y) HANDLE_VP_TO_VVP(VP_##Y, X) +#define ADD_BINARY_VVP_OP(VVPNAME,VPNAME,SDNAME) \ + ADD_VVP_OP(VVPNAME,SDNAME) \ + HANDLE_VP_TO_VVP(VPNAME, VVPNAME) +#endif + +#ifndef ADD_BINARY_VVP_OP_COMPACT +#define ADD_BINARY_VVP_OP_COMPACT(NAME) \ + ADD_BINARY_VVP_OP(VVP_##NAME,VP_##NAME,NAME) #endif // Integer arithmetic. -ADD_BINARY_VVP_OP(VVP_ADD,ADD) +ADD_BINARY_VVP_OP_COMPACT(ADD) +ADD_BINARY_VVP_OP_COMPACT(SUB) +ADD_BINARY_VVP_OP_COMPACT(MUL) +ADD_BINARY_VVP_OP_COMPACT(UDIV) +ADD_BINARY_VVP_OP_COMPACT(SDIV) -ADD_BINARY_VVP_OP(VVP_AND,AND) +ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) +ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) +ADD_BINARY_VVP_OP_COMPACT(SHL) + +ADD_BINARY_VVP_OP_COMPACT(AND) +ADD_BINARY_VVP_OP_COMPACT(OR) +ADD_BINARY_VVP_OP_COMPACT(XOR) + +// FP arithmetic. +ADD_BINARY_VVP_OP_COMPACT(FADD) +ADD_BINARY_VVP_OP_COMPACT(FSUB) +ADD_BINARY_VVP_OP_COMPACT(FMUL) +ADD_BINARY_VVP_OP_COMPACT(FDIV) -#undef HANDLE_VP_TO_VVP #undef ADD_BINARY_VVP_OP +#undef ADD_BINARY_VVP_OP_COMPACT #undef ADD_VVP_OP +#undef HANDLE_VP_TO_VVP diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 7d1e6c553f81..56689d3ee06b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -571,7 +571,6 @@ public: // proper nesting. 
bool ExpectBlockType = false; bool ExpectFuncType = false; - bool ExpectHeapType = false; std::unique_ptr<WebAssemblyOperand> FunctionTable; if (Name == "block") { push(Block); @@ -624,8 +623,6 @@ public: if (parseFunctionTableOperand(&FunctionTable)) return true; ExpectFuncType = true; - } else if (Name == "ref.null") { - ExpectHeapType = true; } if (ExpectFuncType || (ExpectBlockType && Lexer.is(AsmToken::LParen))) { @@ -670,15 +667,6 @@ public: return error("Unknown block type: ", Id); addBlockTypeOperand(Operands, NameLoc, BT); Parser.Lex(); - } else if (ExpectHeapType) { - auto HeapType = WebAssembly::parseHeapType(Id.getString()); - if (HeapType == WebAssembly::HeapType::Invalid) { - return error("Expected a heap type: ", Id); - } - Operands.push_back(std::make_unique<WebAssemblyOperand>( - WebAssemblyOperand::Integer, Id.getLoc(), Id.getEndLoc(), - WebAssemblyOperand::IntOp{static_cast<int64_t>(HeapType)})); - Parser.Lex(); } else { // Assume this identifier is a label. const MCExpr *Val; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index a6b5d4252f2f..128ce5c4fec0 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -112,9 +112,18 @@ bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, return false; } -bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc) { +bool WebAssemblyAsmTypeCheck::checkEnd(SMLoc ErrorLoc, bool PopVals) { if (LastSig.Returns.size() > Stack.size()) return typeError(ErrorLoc, "end: insufficient values on the type stack"); + + if (PopVals) { + for (auto VT : llvm::reverse(LastSig.Returns)) { + if (popType(ErrorLoc, VT)) + return true; + } + return false; + } + for (size_t i = 0; i < LastSig.Returns.size(); i++) { auto EVT = LastSig.Returns[i]; auto PVT = Stack[Stack.size() - LastSig.Returns.size() + i]; @@ -221,7 
+230,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst) { return true; } else if (Name == "end_block" || Name == "end_loop" || Name == "end_if" || Name == "else" || Name == "end_try") { - if (checkEnd(ErrorLoc)) + if (checkEnd(ErrorLoc, Name == "else")) return true; if (Name == "end_block") Unreachable = false; diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index aa35213ccca3..2b07faf67a18 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -39,7 +39,7 @@ class WebAssemblyAsmTypeCheck final { bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, Optional<wasm::ValType> EVT); bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); - bool checkEnd(SMLoc ErrorLoc); + bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, const MCSymbolRefExpr *&SymRef); diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 2e1e4f061219..5d38145559da 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -241,28 +241,6 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction( } break; } - // heap_type operands, for e.g. ref.null: - case WebAssembly::OPERAND_HEAPTYPE: { - int64_t Val; - uint64_t PrevSize = Size; - if (!nextLEB(Val, Bytes, Size, true)) - return MCDisassembler::Fail; - if (Val < 0 && Size == PrevSize + 1) { - // The HeapType encoding is like BlockType, in that encodings that - // decode as negative values indicate ValTypes. 
In practice we expect - // either wasm::ValType::EXTERNREF or wasm::ValType::FUNCREF here. - // - // The positive SLEB values are reserved for future expansion and are - // expected to be type indices in the typed function references - // proposal, and should disassemble as MCSymbolRefExpr as in BlockType - // above. - MI.addOperand(MCOperand::createImm(Val & 0x7f)); - } else { - MI.addOperand( - MCOperand::createImm(int64_t(WebAssembly::HeapType::Invalid))); - } - break; - } // FP operands. case WebAssembly::OPERAND_F32IMM: { if (!parseImmediate<float>(MI, Size, Bytes)) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index 2967aaa00ad4..d72bfdbbfb99 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -366,26 +366,3 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, } } } - -void WebAssemblyInstPrinter::printWebAssemblyHeapTypeOperand(const MCInst *MI, - unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isImm()) { - switch (Op.getImm()) { - case long(wasm::ValType::EXTERNREF): - O << "extern"; - break; - case long(wasm::ValType::FUNCREF): - O << "func"; - break; - default: - O << "unsupported_heap_type_value"; - break; - } - } else { - // Typed function references and other subtypes of funcref and externref - // currently unimplemented. 
- O << "unsupported_heap_type_operand"; - } -} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index 7d980c78c3c9..fe104cbca12e 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -47,8 +47,6 @@ public: raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printWebAssemblyHeapTypeOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O); // Autogenerated by tblgen. std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index c3d259e6ff20..d8122950e061 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "WebAssemblyMCAsmInfo.h" +#include "Utils/WebAssemblyUtilities.h" #include "llvm/ADT/Triple.h" using namespace llvm; @@ -44,5 +45,13 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T, SupportsDebugInformation = true; + // When compilation is done on a cpp file by clang, the exception model info + // is stored in LangOptions, which is later used to set the info in + // TargetOptions and then MCAsmInfo in LLVMTargetMachine::initAsmInfo(). But + // this process does not happen when compiling bitcode directly with clang, so + // we make sure this info is set correctly. + if (WebAssembly::WasmEnableEH || WebAssembly::WasmEnableSjLj) + ExceptionsType = ExceptionHandling::Wasm; + // TODO: UseIntegratedAssembler? 
} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp index 4961c2ef9529..6e494b9430f7 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -106,9 +106,6 @@ void WebAssemblyMCCodeEmitter::encodeInstruction( encodeSLEB128(int64_t(MO.getImm()), OS); break; case WebAssembly::OPERAND_SIGNATURE: - case WebAssembly::OPERAND_HEAPTYPE: - OS << uint8_t(MO.getImm()); - break; case WebAssembly::OPERAND_VEC_I8IMM: support::endian::write<uint8_t>(OS, MO.getImm(), support::little); break; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index d07bfce9abc1..b2f10ca93a4f 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -78,8 +78,6 @@ enum OperandType { OPERAND_BRLIST, /// 32-bit unsigned table number. OPERAND_TABLE, - /// heap type immediate for ref.null. 
- OPERAND_HEAPTYPE, }; } // end namespace WebAssembly diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp index 6f81431bba2d..0412e524f800 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp @@ -41,13 +41,6 @@ Optional<wasm::ValType> WebAssembly::parseType(StringRef Type) { return Optional<wasm::ValType>(); } -WebAssembly::HeapType WebAssembly::parseHeapType(StringRef Type) { - return StringSwitch<WebAssembly::HeapType>(Type) - .Case("extern", WebAssembly::HeapType::Externref) - .Case("func", WebAssembly::HeapType::Funcref) - .Default(WebAssembly::HeapType::Invalid); -} - WebAssembly::BlockType WebAssembly::parseBlockType(StringRef Type) { // Multivalue block types are handled separately in parseSignature return StringSwitch<WebAssembly::BlockType>(Type) diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h index 8d757df27b34..042d51c7d6cb 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.h @@ -41,17 +41,9 @@ enum class BlockType : unsigned { Multivalue = 0xffff, }; -/// Used as immediate MachineOperands for heap types, e.g. for ref.null. 
-enum class HeapType : unsigned { - Invalid = 0x00, - Externref = unsigned(wasm::ValType::EXTERNREF), - Funcref = unsigned(wasm::ValType::FUNCREF), -}; - // Convert StringRef to ValType / HealType / BlockType Optional<wasm::ValType> parseType(StringRef Type); -HeapType parseHeapType(StringRef Type); BlockType parseBlockType(StringRef Type); MVT parseMVT(StringRef Type); diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp index 3da80f4fc875..b87c884c9e4a 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.cpp @@ -18,6 +18,31 @@ #include "llvm/MC/MCContext.h" using namespace llvm; +// Exception handling & setjmp-longjmp handling related options. These are +// defined here to be shared between WebAssembly and its subdirectories. + +// Emscripten's asm.js-style exception handling +cl::opt<bool> WebAssembly::WasmEnableEmEH( + "enable-emscripten-cxx-exceptions", + cl::desc("WebAssembly Emscripten-style exception handling"), + cl::init(false)); +// Emscripten's asm.js-style setjmp/longjmp handling +cl::opt<bool> WebAssembly::WasmEnableEmSjLj( + "enable-emscripten-sjlj", + cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"), + cl::init(false)); +// Exception handling using wasm EH instructions +cl::opt<bool> + WebAssembly::WasmEnableEH("wasm-enable-eh", + cl::desc("WebAssembly exception handling"), + cl::init(false)); +// setjmp/longjmp handling using wasm EH instrutions +cl::opt<bool> + WebAssembly::WasmEnableSjLj("wasm-enable-sjlj", + cl::desc("WebAssembly setjmp/longjmp handling"), + cl::init(false)); + +// Function names in libc++abi and libunwind const char *const WebAssembly::CxaBeginCatchFn = "__cxa_begin_catch"; const char *const WebAssembly::CxaRethrowFn = "__cxa_rethrow"; const char *const WebAssembly::StdTerminateFn = "_ZSt9terminatev"; diff --git 
a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h index f6e96d9b2877..d024185defb4 100644 --- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h +++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_UTILS_WEBASSEMBLYUTILITIES_H #include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/CommandLine.h" namespace llvm { @@ -70,6 +71,12 @@ inline bool isRefType(const Type *Ty) { bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI); bool mayThrow(const MachineInstr &MI); +// Exception handling / setjmp-longjmp handling command-line options +extern cl::opt<bool> WasmEnableEmEH; // asm.js-style EH +extern cl::opt<bool> WasmEnableEmSjLj; // asm.js-style SjLJ +extern cl::opt<bool> WasmEnableEH; // EH using Wasm EH instructions +extern cl::opt<bool> WasmEnableSjLj; // SjLj using Wasm EH instructions + // Exception-related function names extern const char *const ClangCallTerminateFn; extern const char *const CxaBeginCatchFn; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 0d3f51693261..e3af6b2662ef 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -51,8 +51,6 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" extern cl::opt<bool> WasmKeepRegisters; -extern cl::opt<bool> WasmEnableEmEH; -extern cl::opt<bool> WasmEnableEmSjLj; //===----------------------------------------------------------------------===// // Helpers. 
@@ -196,6 +194,13 @@ void WebAssemblyAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { Sym->setGlobalType(wasm::WasmGlobalType{uint8_t(Type), Mutable}); } + // If the GlobalVariable refers to a table, we handle it here instead of + // in emitExternalDecls + if (Sym->isTable()) { + getTargetStreamer()->emitTableType(Sym); + return; + } + emitVisibility(Sym, GV->getVisibility(), !GV->isDeclaration()); if (GV->hasInitializer()) { assert(getSymbolPreferLocal(*GV) == Sym); @@ -315,8 +320,9 @@ void WebAssemblyAsmPrinter::emitExternalDecls(const Module &M) { // will discard it later if it turns out not to be necessary. auto Signature = signatureFromMVTs(Results, Params); bool InvokeDetected = false; - auto *Sym = getMCSymbolForFunction(&F, WasmEnableEmEH || WasmEnableEmSjLj, - Signature.get(), InvokeDetected); + auto *Sym = getMCSymbolForFunction( + &F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, + Signature.get(), InvokeDetected); // Multiple functions can be mapped to the same invoke symbol. 
For // example, two IR functions '__invoke_void_i8*' and '__invoke_void_i32' diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp index 7832f199a2cc..17e867e4c7d8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -1741,7 +1741,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) { void WebAssemblyCFGStackify::cleanupFunctionData(MachineFunction &MF) { if (FakeCallerBB) - MF.DeleteMachineBasicBlock(FakeCallerBB); + MF.deleteMachineBasicBlock(FakeCallerBB); AppendixBB = FakeCallerBB = nullptr; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 1fa0ea3867c7..a3a33f4a5b3a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -31,6 +31,7 @@ HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) +HANDLE_NODETYPE(NARROW_U) HANDLE_NODETYPE(EXTEND_LOW_S) HANDLE_NODETYPE(EXTEND_LOW_U) HANDLE_NODETYPE(EXTEND_HIGH_S) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 0df8f3e0e09c..38ed4c73fb93 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -176,6 +176,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::TRUNCATE); + // Support saturating add for i8x16 and i16x8 for (auto Op : {ISD::SADDSAT, ISD::UADDSAT}) for (auto T : {MVT::v16i8, MVT::v8i16}) @@ -644,8 +646,7 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB, Register RegFuncref = MF.getRegInfo().createVirtualRegister(&WebAssembly::FUNCREFRegClass); MachineInstr 
*RefNull = - BuildMI(MF, DL, TII.get(WebAssembly::REF_NULL_FUNCREF), RegFuncref) - .addImm(static_cast<int32_t>(WebAssembly::HeapType::Funcref)); + BuildMI(MF, DL, TII.get(WebAssembly::REF_NULL_FUNCREF), RegFuncref); BB->insertAfter(Const0->getIterator(), RefNull); MachineInstr *TableSet = @@ -2610,6 +2611,114 @@ performVectorTruncZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DAG.getNode(Op, SDLoc(N), ResVT, Source); } +// Helper to extract VectorWidth bits from Vec, starting from IdxVal. +static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, + const SDLoc &DL, unsigned VectorWidth) { + EVT VT = Vec.getValueType(); + EVT ElVT = VT.getVectorElementType(); + unsigned Factor = VT.getSizeInBits() / VectorWidth; + EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, + VT.getVectorNumElements() / Factor); + + // Extract the relevant VectorWidth bits. Generate an EXTRACT_SUBVECTOR + unsigned ElemsPerChunk = VectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); + + // This is the index of the first element of the VectorWidth-bit chunk + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); + + // If the input is a buildvector just emit a smaller one. + if (Vec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getBuildVector(ResultVT, DL, + Vec->ops().slice(IdxVal, ElemsPerChunk)); + + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, DL); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, VecIdx); +} + +// Helper to recursively truncate vector elements in half with NARROW_U. DstVT +// is the expected destination value type after recursion. In is the initial +// input. Note that the input should have enough leading zero bits to prevent +// NARROW_U from saturating results. 
+static SDValue truncateVectorWithNARROW(EVT DstVT, SDValue In, const SDLoc &DL, + SelectionDAG &DAG) { + EVT SrcVT = In.getValueType(); + + // No truncation required, we might get here due to recursive calls. + if (SrcVT == DstVT) + return In; + + unsigned SrcSizeInBits = SrcVT.getSizeInBits(); + unsigned NumElems = SrcVT.getVectorNumElements(); + if (!isPowerOf2_32(NumElems)) + return SDValue(); + assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); + assert(SrcSizeInBits > DstVT.getSizeInBits() && "Illegal truncation"); + + LLVMContext &Ctx = *DAG.getContext(); + EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); + + // Narrow to the largest type possible: + // vXi64/vXi32 -> i16x8.narrow_i32x4_u and vXi16 -> i8x16.narrow_i16x8_u. + EVT InVT = MVT::i16, OutVT = MVT::i8; + if (SrcVT.getScalarSizeInBits() > 16) { + InVT = MVT::i32; + OutVT = MVT::i16; + } + unsigned SubSizeInBits = SrcSizeInBits / 2; + InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); + OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); + + // Split lower/upper subvectors. + SDValue Lo = extractSubVector(In, 0, DAG, DL, SubSizeInBits); + SDValue Hi = extractSubVector(In, NumElems / 2, DAG, DL, SubSizeInBits); + + // 256bit -> 128bit truncate - Narrow lower/upper 128-bit subvectors. + if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { + Lo = DAG.getBitcast(InVT, Lo); + Hi = DAG.getBitcast(InVT, Hi); + SDValue Res = DAG.getNode(WebAssemblyISD::NARROW_U, DL, OutVT, Lo, Hi); + return DAG.getBitcast(DstVT, Res); + } + + // Recursively narrow lower/upper subvectors, concat result and narrow again. 
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2); + Lo = truncateVectorWithNARROW(PackedVT, Lo, DL, DAG); + Hi = truncateVectorWithNARROW(PackedVT, Hi, DL, DAG); + + PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); + return truncateVectorWithNARROW(DstVT, Res, DL, DAG); +} + +static SDValue performTruncateCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + auto &DAG = DCI.DAG; + + SDValue In = N->getOperand(0); + EVT InVT = In.getValueType(); + if (!InVT.isSimple()) + return SDValue(); + + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + EVT OutSVT = OutVT.getVectorElementType(); + EVT InSVT = InVT.getVectorElementType(); + // Currently only cover truncate to v16i8 or v8i16. + if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) && + (OutSVT == MVT::i8 || OutSVT == MVT::i16) && OutVT.is128BitVector())) + return SDValue(); + + SDLoc DL(N); + APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), + OutVT.getScalarSizeInBits()); + In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); + return truncateVectorWithNARROW(OutVT, In, DL, DAG); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -2626,5 +2735,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: case ISD::CONCAT_VECTORS: return performVectorTruncZeroCombine(N, DCI); + case ISD::TRUNCATE: + return performTruncateCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index ee9247a8bef9..3fb0af1d47a0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -202,11 +202,6 @@ def Signature : Operand<i32> { let PrintMethod = "printWebAssemblySignatureOperand"; } -let OperandType = "OPERAND_HEAPTYPE" in -def 
HeapType : Operand<i32> { - let PrintMethod = "printWebAssemblyHeapTypeOperand"; -} - let OperandType = "OPERAND_TYPEINDEX" in def TypeIndex : Operand<i32>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td index ef9bd35d004a..76a88caafc47 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td @@ -11,13 +11,14 @@ /// //===----------------------------------------------------------------------===// -multiclass REF_I<WebAssemblyRegClass rc, ValueType vt> { - defm REF_NULL_#rc : I<(outs rc:$res), (ins HeapType:$heaptype), - (outs), (ins HeapType:$heaptype), - [], - "ref.null\t$res, $heaptype", - "ref.null\t$heaptype", - 0xd0>, +multiclass REF_I<WebAssemblyRegClass rc, ValueType vt, string ht> { + defm REF_NULL_#rc : I<(outs rc:$dst), (ins), + (outs), (ins), + [(set rc:$dst, (!cast<Intrinsic>("int_wasm_ref_null_" # ht)))], + "ref.null_" # ht # "$dst", + "ref.null_" # ht, + !cond(!eq(ht, "func") : 0xd070, + !eq(ht, "extern") : 0xd06f)>, Requires<[HasReferenceTypes]>; defm SELECT_#rc: I<(outs rc:$dst), (ins rc:$lhs, rc:$rhs, I32:$cond), (outs), (ins), @@ -28,8 +29,8 @@ multiclass REF_I<WebAssemblyRegClass rc, ValueType vt> { Requires<[HasReferenceTypes]>; } -defm "" : REF_I<FUNCREF, funcref>; -defm "" : REF_I<EXTERNREF, externref>; +defm "" : REF_I<FUNCREF, funcref, "func">; +defm "" : REF_I<EXTERNREF, externref, "extern">; foreach rc = [FUNCREF, EXTERNREF] in { def : Pat<(select (i32 (setne I32:$cond, 0)), rc:$lhs, rc:$rhs), diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 30b99c3a69a9..5bb12c7fbdc7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1278,6 +1278,14 @@ multiclass SIMDNarrow<Vec vec, bits<32> baseInst> { defm "" : SIMDNarrow<I16x8, 101>; defm "" : SIMDNarrow<I32x4, 133>; +// 
WebAssemblyISD::NARROW_U +def wasm_narrow_t : SDTypeProfile<1, 2, []>; +def wasm_narrow_u : SDNode<"WebAssemblyISD::NARROW_U", wasm_narrow_t>; +def : Pat<(v16i8 (wasm_narrow_u (v8i16 V128:$left), (v8i16 V128:$right))), + (NARROW_U_I8x16 $left, $right)>; +def : Pat<(v8i16 (wasm_narrow_u (v4i32 V128:$left), (v4i32 V128:$right))), + (NARROW_U_I16x8 $left, $right)>; + // Bitcasts are nops // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types foreach t1 = AllVecs in diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td index e44c2073eaeb..1fd00bf1cbc8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrTable.td @@ -20,7 +20,7 @@ def WebAssemblyTableGet : SDNode<"WebAssemblyISD::TABLE_GET", WebAssemblyTableGe [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; -multiclass TABLE<WebAssemblyRegClass rc> { +multiclass TABLE<WebAssemblyRegClass rc, string suffix> { let mayLoad = 1 in defm TABLE_GET_#rc : I<(outs rc:$res), (ins table32_op:$table, I32:$i), (outs), (ins table32_op:$table), @@ -39,14 +39,14 @@ multiclass TABLE<WebAssemblyRegClass rc> { defm TABLE_GROW_#rc : I<(outs I32:$sz), (ins table32_op:$table, rc:$val, I32:$n), (outs), (ins table32_op:$table), - [], + [(set I32:$sz, (!cast<Intrinsic>("int_wasm_table_grow_" # suffix) (WebAssemblyWrapper tglobaladdr:$table), rc:$val, I32:$n))], "table.grow\t$sz, $table, $val, $n", "table.grow\t$table", 0xfc0f>; defm TABLE_FILL_#rc : I<(outs), (ins table32_op:$table, I32:$i, rc:$val, I32:$n), (outs), (ins table32_op:$table), - [], + [(!cast<Intrinsic>("int_wasm_table_fill_" # suffix) (WebAssemblyWrapper tglobaladdr:$table), I32:$i, rc:$val, I32:$n)], "table.fill\t$table, $i, $val, $n", "table.fill\t$table", 0xfc11>; @@ -62,8 +62,8 @@ multiclass TABLE<WebAssemblyRegClass rc> { } } -defm "" : TABLE<FUNCREF>, Requires<[HasReferenceTypes]>; -defm "" : TABLE<EXTERNREF>, 
Requires<[HasReferenceTypes]>; +defm "" : TABLE<FUNCREF, "funcref">, Requires<[HasReferenceTypes]>; +defm "" : TABLE<EXTERNREF, "externref">, Requires<[HasReferenceTypes]>; def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), (TABLE_SET_FUNCREF mcsym:$table, i32:$idx, funcref:$r)>, @@ -71,7 +71,7 @@ def : Pat<(WebAssemblyTableSet mcsym:$table, i32:$idx, funcref:$r), defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table), (outs), (ins table32_op:$table), - [], + [(set I32:$sz, (int_wasm_table_size (WebAssemblyWrapper tglobaladdr:$table)))], "table.size\t$sz, $table", "table.size\t$table", 0xfc10>, @@ -80,7 +80,9 @@ defm TABLE_SIZE : I<(outs I32:$sz), (ins table32_op:$table), defm TABLE_COPY : I<(outs), (ins table32_op:$table1, table32_op:$table2, I32:$d, I32:$s, I32:$n), (outs), (ins table32_op:$table1, table32_op:$table2), - [], + [(int_wasm_table_copy (WebAssemblyWrapper tglobaladdr:$table1), + (WebAssemblyWrapper tglobaladdr:$table2), + I32:$d, I32:$s, I32:$n)], "table.copy\t$table1, $table2, $d, $s, $n", "table.copy\t$table1, $table2", 0xfc0e>, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 4eacc921b6cd..23aaa5160abd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -267,6 +267,7 @@ /// ///===----------------------------------------------------------------------===// +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyTargetMachine.h" #include "llvm/ADT/StringExtras.h" @@ -285,13 +286,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower-em-ehsjlj" -// Emscripten's asm.js-style exception handling -extern cl::opt<bool> WasmEnableEmEH; -// Emscripten's asm.js-style setjmp/longjmp handling -extern cl::opt<bool> WasmEnableEmSjLj; -// Wasm setjmp/longjmp handling using wasm EH instructions -extern 
cl::opt<bool> WasmEnableSjLj; - static cl::list<std::string> EHAllowlist("emscripten-cxx-exceptions-allowed", cl::desc("The list of function names in which Emscripten-style " @@ -370,8 +364,9 @@ public: static char ID; WebAssemblyLowerEmscriptenEHSjLj() - : ModulePass(ID), EnableEmEH(WasmEnableEmEH), - EnableEmSjLj(WasmEnableEmSjLj), EnableWasmSjLj(WasmEnableSjLj) { + : ModulePass(ID), EnableEmEH(WebAssembly::WasmEnableEmEH), + EnableEmSjLj(WebAssembly::WasmEnableEmSjLj), + EnableWasmSjLj(WebAssembly::WasmEnableSjLj) { assert(!(EnableEmSjLj && EnableWasmSjLj) && "Two SjLj modes cannot be turned on at the same time"); assert(!(EnableEmEH && EnableWasmSjLj) && diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index 0b953a90aeab..09bccef17ab0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -40,9 +40,6 @@ cl::opt<bool> " instruction output for test purposes only."), cl::init(false)); -extern cl::opt<bool> WasmEnableEmEH; -extern cl::opt<bool> WasmEnableEmSjLj; - static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI); MCSymbol * @@ -66,9 +63,11 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { // they reach this point as aggregate Array types with an element type // that is a reference type. 
wasm::ValType Type; + bool IsTable = false; if (GlobalVT->isArrayTy() && WebAssembly::isRefType(GlobalVT->getArrayElementType())) { MVT VT; + IsTable = true; switch (GlobalVT->getArrayElementType()->getPointerAddressSpace()) { case WebAssembly::WasmAddressSpace::WASM_ADDRESS_SPACE_FUNCREF: VT = MVT::funcref; @@ -85,9 +84,14 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { } else report_fatal_error("Aggregate globals not yet implemented"); - WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); - WasmSym->setGlobalType( - wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + if (IsTable) { + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); + WasmSym->setTableType(Type); + } else { + WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL); + WasmSym->setGlobalType( + wasm::WasmGlobalType{uint8_t(Type), /*Mutable=*/true}); + } } return WasmSym; } @@ -105,7 +109,8 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { bool InvokeDetected = false; auto *WasmSym = Printer.getMCSymbolForFunction( - F, WasmEnableEmEH || WasmEnableEmSjLj, Signature.get(), InvokeDetected); + F, WebAssembly::WasmEnableEmEH || WebAssembly::WasmEnableEmSjLj, + Signature.get(), InvokeDetected); WasmSym->setSignature(Signature.get()); Printer.addSignature(std::move(Signature)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); @@ -275,11 +280,6 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI, SmallVector<wasm::ValType, 4>()); break; } - } else if (Info.OperandType == WebAssembly::OPERAND_HEAPTYPE) { - assert(static_cast<WebAssembly::HeapType>(MO.getImm()) != - WebAssembly::HeapType::Invalid); - // With typed function references, this will need a case for type - // index operands. Otherwise, fall through. 
} } MCOp = MCOperand::createImm(MO.getImm()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 7b70d99b5f52..482837178f3d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -14,6 +14,7 @@ #include "WebAssemblyTargetMachine.h" #include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "TargetInfo/WebAssemblyTargetInfo.h" +#include "Utils/WebAssemblyUtilities.h" #include "WebAssembly.h" #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblyTargetObjectFile.h" @@ -24,6 +25,7 @@ #include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Scalar.h" @@ -33,28 +35,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm" -// Emscripten's asm.js-style exception handling -cl::opt<bool> - WasmEnableEmEH("enable-emscripten-cxx-exceptions", - cl::desc("WebAssembly Emscripten-style exception handling"), - cl::init(false)); - -// Emscripten's asm.js-style setjmp/longjmp handling -cl::opt<bool> WasmEnableEmSjLj( - "enable-emscripten-sjlj", - cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"), - cl::init(false)); - -// Exception handling using wasm EH instructions -cl::opt<bool> WasmEnableEH("wasm-enable-eh", - cl::desc("WebAssembly exception handling"), - cl::init(false)); - -// setjmp/longjmp handling using wasm EH instrutions -cl::opt<bool> WasmEnableSjLj("wasm-enable-sjlj", - cl::desc("WebAssembly setjmp/longjmp handling"), - cl::init(false)); - // A command-line option to keep implicit locals // for the purpose of testing with lit/llc ONLY. 
// This produces output which is not valid WebAssembly, and is not supported @@ -368,7 +348,23 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -static void basicCheckForEHAndSjLj(const TargetMachine *TM) { +using WebAssembly::WasmEnableEH; +using WebAssembly::WasmEnableEmEH; +using WebAssembly::WasmEnableEmSjLj; +using WebAssembly::WasmEnableSjLj; + +static void basicCheckForEHAndSjLj(TargetMachine *TM) { + // Before checking, we make sure TargetOptions.ExceptionModel is the same as + // MCAsmInfo.ExceptionsType. Normally these have to be the same, because clang + // stores the exception model info in LangOptions, which is later transferred + // to TargetOptions and MCAsmInfo. But when clang compiles bitcode directly, + // clang's LangOptions is not used and thus the exception model info is not + // correctly transferred to TargetOptions and MCAsmInfo, so we make sure we + // have the correct exception model in in WebAssemblyMCAsmInfo constructor. + // But in this case TargetOptions is still not updated, so we make sure they + // are the same. + TM->Options.ExceptionModel = TM->getMCAsmInfo()->getExceptionHandlingType(); + // Basic Correctness checking related to -exception-model if (TM->Options.ExceptionModel != ExceptionHandling::None && TM->Options.ExceptionModel != ExceptionHandling::Wasm) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 8ce6b47d10e8..2ba0b97229cc 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1759,7 +1759,8 @@ bool X86AsmParser::CreateMemForMSInlineAsm( // registers in a mmory expression, and though unaccessible via rip/eip. 
if (IsGlobalLV && (BaseReg || IndexReg)) { Operands.push_back(X86Operand::CreateMem(getPointerWidth(), Disp, Start, - End, Size, Identifier, Decl)); + End, Size, Identifier, Decl, + FrontendSize)); return false; } // Otherwise, we set the base register to a non-zero value @@ -2551,8 +2552,6 @@ bool X86AsmParser::ParseIntelOperand(OperandVector &Operands) { StringRef ErrMsg; unsigned BaseReg = SM.getBaseReg(); unsigned IndexReg = SM.getIndexReg(); - if (IndexReg && BaseReg == X86::RIP) - BaseReg = 0; unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -4430,8 +4429,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If exactly one matched, then we treat that as a successful match (and the // instruction will already have been filled in correctly, since the failing // matches won't have modified it). - unsigned NumSuccessfulMatches = - std::count(std::begin(Match), std::end(Match), Match_Success); + unsigned NumSuccessfulMatches = llvm::count(Match, Match_Success); if (NumSuccessfulMatches == 1) { if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) return true; @@ -4479,7 +4477,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If all of the instructions reported an invalid mnemonic, then the original // mnemonic was invalid. - if (std::count(std::begin(Match), std::end(Match), Match_MnemonicFail) == 4) { + if (llvm::count(Match, Match_MnemonicFail) == 4) { if (OriginalError == Match_MnemonicFail) return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'", Op.getLocRange(), MatchingInlineAsm); @@ -4508,16 +4506,14 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, } // If one instruction matched as unsupported, report this as unsupported. 
- if (std::count(std::begin(Match), std::end(Match), - Match_Unsupported) == 1) { + if (llvm::count(Match, Match_Unsupported) == 1) { return Error(IDLoc, "unsupported instruction", EmptyRange, MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a // missing feature. - if (std::count(std::begin(Match), std::end(Match), - Match_MissingFeature) == 1) { + if (llvm::count(Match, Match_MissingFeature) == 1) { ErrorInfo = Match_MissingFeature; return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); @@ -4525,8 +4521,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with an invalid operand, report this as an // operand failure. - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidOperand) == 1) { + if (llvm::count(Match, Match_InvalidOperand) == 1) { return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } @@ -4674,8 +4669,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, Op.getLocRange(), MatchingInlineAsm); } - unsigned NumSuccessfulMatches = - std::count(std::begin(Match), std::end(Match), Match_Success); + unsigned NumSuccessfulMatches = llvm::count(Match, Match_Success); // If matching was ambiguous and we had size information from the frontend, // try again with that. This handles cases like "movxz eax, m8/m16". @@ -4721,16 +4715,14 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, } // If one instruction matched as unsupported, report this as unsupported. - if (std::count(std::begin(Match), std::end(Match), - Match_Unsupported) == 1) { + if (llvm::count(Match, Match_Unsupported) == 1) { return Error(IDLoc, "unsupported instruction", EmptyRange, MatchingInlineAsm); } // If one instruction matched with a missing feature, report this as a // missing feature. 
- if (std::count(std::begin(Match), std::end(Match), - Match_MissingFeature) == 1) { + if (llvm::count(Match, Match_MissingFeature) == 1) { ErrorInfo = Match_MissingFeature; return ErrorMissingFeature(IDLoc, ErrorInfoMissingFeatures, MatchingInlineAsm); @@ -4738,14 +4730,12 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // If one instruction matched with an invalid operand, report this as an // operand failure. - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidOperand) == 1) { + if (llvm::count(Match, Match_InvalidOperand) == 1) { return Error(IDLoc, "invalid operand for instruction", EmptyRange, MatchingInlineAsm); } - if (std::count(std::begin(Match), std::end(Match), - Match_InvalidImmUnsignedi4) == 1) { + if (llvm::count(Match, Match_InvalidImmUnsignedi4) == 1) { SMLoc ErrorLoc = ((X86Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h index 9164c699b569..67b1244708a8 100644 --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -285,6 +285,12 @@ struct X86Operand final : public MCParsedAsmOperand { bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; } + bool isMemPlaceholder(const MCInstrDesc &Desc) const override { + // Only MS InlineAsm uses global variables with registers rather than + // rip/eip. 
+ return isMem() && !Mem.DefaultBaseReg && Mem.FrontendSize; + } + bool needAddressOf() const override { return AddressOf; } bool isMem() const override { return Kind == Memory; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index b51011e2c52f..a903c5f455a2 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -948,39 +948,39 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; CASE_UNPCK(PUNPCKHBW, r) - case X86::MMX_PUNPCKHBWirr: + case X86::MMX_PUNPCKHBWrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHBW, m) - case X86::MMX_PUNPCKHBWirm: + case X86::MMX_PUNPCKHBWrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); break; CASE_UNPCK(PUNPCKHWD, r) - case X86::MMX_PUNPCKHWDirr: + case X86::MMX_PUNPCKHWDrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHWD, m) - case X86::MMX_PUNPCKHWDirm: + case X86::MMX_PUNPCKHWDrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); break; CASE_UNPCK(PUNPCKHDQ, r) - case X86::MMX_PUNPCKHDQirr: + case X86::MMX_PUNPCKHDQrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKHDQ, m) - case X86::MMX_PUNPCKHDQirm: + case X86::MMX_PUNPCKHDQrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKHMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); @@ -998,39 +998,39 @@ bool 
llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; CASE_UNPCK(PUNPCKLBW, r) - case X86::MMX_PUNPCKLBWirr: + case X86::MMX_PUNPCKLBWrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLBW, m) - case X86::MMX_PUNPCKLBWirm: + case X86::MMX_PUNPCKLBWrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 8, 0), 8, ShuffleMask); break; CASE_UNPCK(PUNPCKLWD, r) - case X86::MMX_PUNPCKLWDirr: + case X86::MMX_PUNPCKLWDrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLWD, m) - case X86::MMX_PUNPCKLWDirm: + case X86::MMX_PUNPCKLWDrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 16, 0), 16, ShuffleMask); break; CASE_UNPCK(PUNPCKLDQ, r) - case X86::MMX_PUNPCKLDQirr: + case X86::MMX_PUNPCKLDQrr: Src2Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); RegForm = true; LLVM_FALLTHROUGH; CASE_UNPCK(PUNPCKLDQ, m) - case X86::MMX_PUNPCKLDQirm: + case X86::MMX_PUNPCKLDQrm: Src1Name = getRegName(MI->getOperand(NumOperands-(RegForm?2:6)).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeUNPCKLMask(getRegOperandNumElts(MI, 32, 0), 32, ShuffleMask); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp index 11251fb2b2ba..bf3f4e990ecc 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -236,7 +236,7 @@ bool X86WinCOFFTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) { bool X86WinCOFFTargetStreamer::emitFPOStackAlign(unsigned Align, SMLoc L) { if (checkInFPOPrologue(L)) return true; - 
if (!llvm::any_of(CurFPOData->Instructions, [](const FPOInstruction &Inst) { + if (llvm::none_of(CurFPOData->Instructions, [](const FPOInstruction &Inst) { return Inst.Op == FPOInstruction::SetFrame; })) { getContext().reportError( diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 2e08482e4ff6..d48b8e458219 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -754,8 +754,6 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) { void X86AsmPrinter::emitEndOfAsmFile(Module &M) { const Triple &TT = TM.getTargetTriple(); - emitAsanMemaccessSymbols(M); - if (TT.isOSBinFormatMachO()) { // Mach-O uses non-lazy symbol stubs to encode per-TU information into // global table for symbol lookup. diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h index 3b0983a7d935..b22f25af26cf 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/llvm/lib/Target/X86/X86AsmPrinter.h @@ -31,6 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; bool EmitFPOData = false; + bool ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow @@ -100,20 +101,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // Address sanitizer specific lowering for X86. 
void LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI); - void emitAsanMemaccessSymbols(Module &M); - void emitAsanMemaccessPartial(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - void emitAsanMemaccessFull(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - void emitAsanReportError(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI); - - typedef std::tuple<unsigned /*Reg*/, uint32_t /*AccessInfo*/> - AsanMemaccessTuple; - std::map<AsanMemaccessTuple, MCSymbol *> AsanMemaccessSymbols; // Choose between emitting .seh_ directives and .cv_fpo_ directives. void EmitSEHInstruction(const MachineInstr *MI); @@ -165,6 +152,10 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; void emitFunctionBodyStart() override; void emitFunctionBodyEnd() override; + + bool shouldEmitWeakSwiftAsyncExtendedFramePointerFlags() const override { + return ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index 863438793acf..96d3d1390a59 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -186,7 +186,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) { for (auto &Group : AllCmovGroups) { // Skip any group that doesn't do at least one memory operand cmov. 
- if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) + if (llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) continue; // For CMOV groups which we can rewrite and which contain a memory load, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 93bc23006dc4..6a047838f0b5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -191,8 +191,6 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { // Expand CALL_RVMARKER pseudo to call instruction, followed by the special //"movq %rax, %rdi" marker. - // TODO: Mark the sequence as bundle, to avoid passes moving other code - // in between. MachineInstr &MI = *MBBI; MachineInstr *OriginalCall; @@ -236,15 +234,23 @@ void X86ExpandPseudo::expandCALL_RVMARKER(MachineBasicBlock &MBB, // Emit call to ObjC runtime. const uint32_t *RegMask = TRI->getCallPreservedMask(*MBB.getParent(), CallingConv::C); - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32)) - .addGlobalAddress(MI.getOperand(0).getGlobal(), 0, 0) - .addRegMask(RegMask) - .addReg(X86::RAX, - RegState::Implicit | - (RAXImplicitDead ? (RegState::Dead | RegState::Define) - : RegState::Define)) - .getInstr(); + MachineInstr *RtCall = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(X86::CALL64pcrel32)) + .addGlobalAddress(MI.getOperand(0).getGlobal(), 0, 0) + .addRegMask(RegMask) + .addReg(X86::RAX, + RegState::Implicit | + (RAXImplicitDead ? (RegState::Dead | RegState::Define) + : RegState::Define)) + .getInstr(); MI.eraseFromParent(); + + auto &TM = MBB.getParent()->getTarget(); + // On Darwin platforms, wrap the expanded sequence in a bundle to prevent + // later optimizations from breaking up the sequence. 
+ if (TM.getTargetTriple().isOSDarwin()) + finalizeBundle(MBB, OriginalCall->getIterator(), + std::next(RtCall->getIterator())); } /// If \p MBBI is a pseudo instruction, this method expands diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 87c04a07cd13..47874e82ff3b 100644 --- a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -134,11 +134,7 @@ bool X86FastTileConfig::isAMXInstr(MachineInstr &MI) { if (MI.getOpcode() == X86::PLDTILECFGV || MI.isDebugInstr()) return false; - for (MachineOperand &MO : MI.operands()) - if (isTilePhysReg(MO)) - return true; - - return false; + return llvm::any_of(MI.operands(), isTilePhysReg); } MachineInstr *X86FastTileConfig::getKeyAMXInstr(MachineInstr *MI) { diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index e1d4b4c34772..16bff201dd03 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -457,14 +457,12 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, OptForSize = MF.getFunction().hasOptSize() || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI); - for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { - MachineInstr *MI = &*I; - - if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB)) - MIReplacements.push_back(std::make_pair(MI, NewMI)); + for (MachineInstr &MI : llvm::reverse(MBB)) { + if (MachineInstr *NewMI = tryReplaceInstr(&MI, MBB)) + MIReplacements.push_back(std::make_pair(&MI, NewMI)); // We're done with this instruction, update liveness for the next one. 
- LiveRegs.stepBackward(*MI); + LiveRegs.stepBackward(MI); } while (!MIReplacements.empty()) { diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index 4d9160f35226..2f0ab4ca9de4 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -1442,7 +1442,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { assert(UpdatedSlot < StackTop && Dest < 7); Stack[UpdatedSlot] = Dest; RegMap[Dest] = UpdatedSlot; - MBB->getParent()->DeleteMachineInstr(&MI); // Remove the old instruction + MBB->getParent()->deleteMachineInstr(&MI); // Remove the old instruction } /// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c29ae9f6af4c..0a7aea467809 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2496,8 +2496,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( } // Assign slots for GPRs. It increases frame size. - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2506,15 +2506,15 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( CalleeSavedFrameSize += SlotSize; int SlotIndex = MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); - CSI[i - 1].setFrameIdx(SlotIndex); + I.setFrameIdx(SlotIndex); } X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize); // Assign slots for XMMs. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2533,7 +2533,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); - CSI[i - 1].setFrameIdx(SlotIndex); + I.setFrameIdx(SlotIndex); MFI.ensureMaxAlignment(Alignment); // Save the start offset and size of XMM in stack frame for funclets. @@ -2559,8 +2559,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Push GPRs. It increases frame size. const MachineFunction &MF = *MBB.getParent(); unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; - for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i - 1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; @@ -2593,8 +2593,8 @@ bool X86FrameLowering::spillCalleeSavedRegisters( // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. 
- for (unsigned i = CSI.size(); i != 0; --i) { - unsigned Reg = CSI[i-1].getReg(); + for (const CalleeSavedInfo &I : llvm::reverse(CSI)) { + unsigned Reg = I.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; @@ -2607,8 +2607,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); - TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, - TRI); + TII.storeRegToStackSlot(MBB, MI, Reg, true, I.getFrameIdx(), RC, TRI); --MI; MI->setFlag(MachineInstr::FrameSetup); ++MI; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 62b2387396be..6f6361b6757b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1091,17 +1091,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + if (VT == MVT::v2i64) continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } - setOperationAction(ISD::ROTL, MVT::v4i32, Custom); - setOperationAction(ISD::ROTL, MVT::v8i16, Custom); - - // With 512-bit registers or AVX512VL+BW, expanding (and promoting the - // shifts) is better. 
- if (!Subtarget.useAVX512Regs() && - !(Subtarget.hasBWI() && Subtarget.hasVLX())) - setOperationAction(ISD::ROTL, MVT::v16i8, Custom); - setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); @@ -1199,8 +1193,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, - MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) + MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } // XOP can efficiently perform BITREVERSE with VPPERM. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) @@ -1283,6 +1279,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + if (VT == MVT::v4i64) continue; + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); } // These types need custom splitting if their input is a 128-bit vector. @@ -1291,13 +1290,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); - setOperationAction(ISD::ROTL, MVT::v8i32, Custom); - setOperationAction(ISD::ROTL, MVT::v16i16, Custom); - - // With BWI, expanding (and promoting the shifts) is the better. 
- if (!Subtarget.useBWIRegs()) - setOperationAction(ISD::ROTL, MVT::v32i8, Custom); - setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8i32, Custom); @@ -1662,6 +1654,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); // The condition codes aren't legal in SSE/AVX and under AVX512 we use @@ -1676,16 +1670,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, VT, Legal); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); - setOperationAction(ISD::ROTL, VT, Custom); - setOperationAction(ISD::ROTR, VT, Custom); setOperationAction(ISD::STRICT_FSETCC, VT, Custom); setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); } - // With BWI, expanding (and promoting the shifts) is the better. - if (!Subtarget.useBWIRegs()) - setOperationAction(ISD::ROTL, MVT::v32i16, Custom); - for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); @@ -5926,8 +5914,7 @@ static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, /// from position Pos and ending in Pos+Size is undef or is zero. 
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { - return llvm::all_of(Mask.slice(Pos, Size), - [](int M) { return isUndefOrZero(M); }); + return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero); } /// Helper function to test whether a shuffle mask could be @@ -6788,12 +6775,33 @@ void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, } } +// Attempt to constant fold, else just create a VECTOR_SHUFFLE. +static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, + SDValue V1, SDValue V2, ArrayRef<int> Mask) { + if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) && + (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) { + SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType())); + for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) { + int M = Mask[I]; + if (M < 0) + continue; + SDValue V = (M < NumElts) ? V1 : V2; + if (V.isUndef()) + continue; + Ops[I] = V.getOperand(M % NumElts); + } + return DAG.getBuildVector(VT, dl, Ops); + } + + return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); +} + /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); - return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); + return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a vector_shuffle node for an unpackh operation. @@ -6801,12 +6809,11 @@ static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2) { SmallVector<int, 8> Mask; createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); - return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); + return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); } /// Returns a node that packs the LHS + RHS nodes together at half width. 
/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half. -/// TODO: Add vXi64 -> vXi32 pack support with vector_shuffle node. /// TODO: Add subvector splitting if/when we have a need for it. static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, @@ -6818,9 +6825,24 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, VT.getSizeInBits() == OpVT.getSizeInBits() && (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() && "Unexpected PACK operand types"); - assert((EltSizeInBits == 8 || EltSizeInBits == 16) && + assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) && "Unexpected PACK result type"); + // Rely on vector shuffles for vXi64 -> vXi32 packing. + if (EltSizeInBits == 32) { + SmallVector<int> PackMask; + int Offset = PackHiHalf ? 1 : 0; + int NumElts = VT.getVectorNumElements(); + for (int I = 0; I != NumElts; I += 4) { + PackMask.push_back(I + Offset); + PackMask.push_back(I + Offset + 2); + PackMask.push_back(I + Offset + NumElts); + PackMask.push_back(I + Offset + NumElts + 2); + } + return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS), + DAG.getBitcast(VT, RHS), PackMask); + } + // See if we already have sufficient leading bits for PACKSS/PACKUS. if (!PackHiHalf) { if (UsePackUS && @@ -15192,12 +15214,10 @@ static SDValue lowerV8I16GeneralSingleInputShuffle( // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. 
- int NumFlippedAToBInputs = - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + - std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); - int NumFlippedBToBInputs = - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + - std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); + int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + + llvm::count(AToBInputs, 2 * ADWord + 1); + int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + + llvm::count(BToBInputs, 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && @@ -25599,6 +25619,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, /// Handle vector element shifts where the shift amount may or may not be a /// constant. Takes immediate version of shift as input. +/// TODO: Replace with vector + (splat) idx to avoid extract_element nodes. static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, const X86Subtarget &Subtarget, @@ -25606,11 +25627,6 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, MVT SVT = ShAmt.getSimpleValueType(); assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!"); - // Catch shift-by-constant. - if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) - return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp, - CShAmt->getZExtValue(), DAG); - // Change opcode to non-immediate version. Opc = getTargetVShiftUniformOpcode(Opc, true); @@ -26342,10 +26358,19 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, DAG.getBitcast(MVT::i16, Ins)); } - case VSHIFT: + case VSHIFT: { + SDValue SrcOp = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + + // Catch shift-by-constant. 
+ if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) + return getTargetVShiftByConstNode(IntrData->Opc0, dl, + Op.getSimpleValueType(), SrcOp, + CShAmt->getZExtValue(), DAG); + return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), - Op.getOperand(1), Op.getOperand(2), Subtarget, - DAG); + SrcOp, ShAmt, Subtarget, DAG); + } case COMPRESS_EXPAND_IN_REG: { SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); @@ -26638,7 +26663,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, X86CC = X86::COND_E; break; } - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); @@ -26653,7 +26678,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else Opcode = X86ISD::PCMPESTR; - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps); } @@ -26666,7 +26691,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else Opcode = X86ISD::PCMPESTR; - SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end()); + SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); } @@ -28892,10 +28917,13 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op, // supported by the Subtarget static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { + if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) + return false; + if (VT.getScalarSizeInBits() < 16) return false; - if (VT.is512BitVector() && 
Subtarget.hasAVX512() && + if (VT.is512BitVector() && Subtarget.useAVX512Regs() && (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) return true; @@ -28919,6 +28947,8 @@ bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget, // natively supported by the Subtarget static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, unsigned Opcode) { + if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) + return false; if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) return false; @@ -28927,7 +28957,8 @@ static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) return false; - if (Subtarget.hasAVX512()) + if (Subtarget.hasAVX512() && + (Subtarget.useAVX512Regs() || !VT.is512BitVector())) return true; bool LShift = VT.is128BitVector() || VT.is256BitVector(); @@ -28935,8 +28966,8 @@ static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget, return (Opcode == ISD::SRA) ? 
AShift : LShift; } -static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -29066,8 +29097,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, return SDValue(); } -static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); SDValue R = Op.getOperand(0); @@ -29166,28 +29197,20 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, (Subtarget.hasBWI() && VT == MVT::v64i8))) return SDValue(); - if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { - SmallVector<SDValue, 8> Elts; - MVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - APInt One(SVTBits, 1); - unsigned NumElems = VT.getVectorNumElements(); - - for (unsigned i = 0; i != NumElems; ++i) { - SDValue Op = Amt->getOperand(i); - if (Op->isUndef()) { - Elts.push_back(Op); - continue; - } + MVT SVT = VT.getVectorElementType(); + unsigned SVTBits = SVT.getSizeInBits(); + unsigned NumElems = VT.getVectorNumElements(); - ConstantSDNode *ND = cast<ConstantSDNode>(Op); - APInt C(SVTBits, ND->getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - if (ShAmt >= SVTBits) { - Elts.push_back(DAG.getUNDEF(SVT)); + APInt UndefElts; + SmallVector<APInt> EltBits; + if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) { + APInt One(SVTBits, 1); + SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT)); + for (unsigned I = 0; I != NumElems; ++I) { + if (UndefElts[I] || EltBits[I].uge(SVTBits)) continue; - } - Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT)); + uint64_t ShAmt = 
EltBits[I].getZExtValue(); + Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT); } return DAG.getBuildVector(VT, dl, Elts); } @@ -29233,10 +29256,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, assert(VT.isVector() && "Custom lowering only for vector shifts!"); assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); - if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget)) + if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget)) return V; - if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) + if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget)) return V; if (supportedVectorVarShift(VT, Subtarget, Opc)) @@ -29818,14 +29841,29 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); } - assert(IsROTL && "Only ROTL supported"); + SDValue Z = DAG.getConstant(0, DL, VT); + + if (!IsROTL) { + // If the ISD::ROTR amount is constant, we're always better converting to + // ISD::ROTL. + if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt})) + return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt); + + // XOP targets always prefers ISD::ROTL. + if (Subtarget.hasXOP()) + return DAG.getNode(ISD::ROTL, DL, VT, R, + DAG.getNode(ISD::SUB, DL, VT, Z, Amt)); + } + + // Split 256-bit integers on XOP/pre-AVX2 targets. + if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2())) + return splitVectorIntBinary(Op, DAG); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // XOP implicitly uses modulo rotation amounts. if (Subtarget.hasXOP()) { - if (VT.is256BitVector()) - return splitVectorIntBinary(Op, DAG); + assert(IsROTL && "Only ROTL expected"); assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); // Attempt to rotate by immediate. 
@@ -29839,55 +29877,89 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return Op; } - // Split 256-bit integers on pre-AVX2 targets. - if (VT.is256BitVector() && !Subtarget.hasAVX2()) - return splitVectorIntBinary(Op, DAG); - - assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || - ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8 || - VT == MVT::v32i16) && - Subtarget.hasAVX2())) && - "Only vXi32/vXi16/vXi8 vector rotates supported"); - // Rotate by an uniform constant - expand back to shifts. if (IsCstSplat) return SDValue(); - bool IsSplatAmt = DAG.isSplatValue(Amt); - SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); + // Split 512-bit integers on non 512-bit BWI targets. + if (VT.is512BitVector() && !Subtarget.useBWIRegs()) + return splitVectorIntBinary(Op, DAG); - // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by - // the amount bit. - if (EltSizeInBits == 8) { - if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) - return SDValue(); + assert( + (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || + ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && + Subtarget.hasAVX2()) || + ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) && + "Only vXi32/vXi16/vXi8 vector rotates supported"); - // Check for a hidden ISD::ROTR, vXi8 lowering can handle both, but we - // currently hit infinite loops in legalization if we allow ISD::ROTR. - // FIXME: Infinite ROTL<->ROTR legalization in TargetLowering::expandROT. 
- SDValue HiddenROTRAmt; - if (Amt.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(Amt.getOperand(0).getNode())) - HiddenROTRAmt = Amt.getOperand(1); + MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); + MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); - MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); + SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); + SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - // If the amount is a splat, attempt to fold as unpack(x,x) << zext(y): - // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. - // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). - if (SDValue BaseRotAmt = DAG.getSplatValue(DAG.getNode( - ISD::AND, DL, VT, HiddenROTRAmt ? HiddenROTRAmt : Amt, AmtMask))) { - unsigned ShiftX86Opc = HiddenROTRAmt ? X86ISD::VSRLI : X86ISD::VSHLI; - BaseRotAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseRotAmt); + // Attempt to fold as unpack(x,x) << zext(splat(y)): + // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). + // TODO: Handle vXi16 cases. + if (EltSizeInBits == 8 || EltSizeInBits == 32) { + if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) { + unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); + BaseRotAmt = DAG.getZExtOrTrunc(BaseRotAmt, DL, MVT::i32); Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, Subtarget, DAG); Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, Subtarget, DAG); - return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !HiddenROTRAmt); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); + } + } + + // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by + // the amount bit. + // TODO: We're doing nothing here that we couldn't do for funnel shifts. 
+ if (EltSizeInBits == 8) { + bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); + MVT WideVT = + MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts); + unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL; + + // Attempt to fold as: + // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). + if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && + supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { + // If we're rotating by constant, just use default promotion. + if (IsConstAmt) + return SDValue(); + // See if we can perform this by widening to vXi16 or vXi32. + R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R); + R = DAG.getNode( + ISD::OR, DL, WideVT, R, + getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG)); + Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); + R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt); + if (IsROTL) + R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG); + return DAG.getNode(ISD::TRUNCATE, DL, VT, R); } + // Attempt to fold as unpack(x,x) << zext(y): + // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. + // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). + if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { + // See if we can perform this by unpacking to lo/hi vXi16. 
+ SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); + SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); + SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); + SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); + SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); + SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); + return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); + } + assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type"); + // We don't need ModuloAmt here as we just peek at individual bits. auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { if (Subtarget.hasSSE41()) { @@ -29907,15 +29979,15 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return DAG.getSelect(DL, SelVT, C, V0, V1); }; - // 'Hidden' ROTR is currently only profitable on AVX512 targets where we - // have VPTERNLOG. - unsigned ShiftLHS = ISD::SHL; - unsigned ShiftRHS = ISD::SRL; - if (HiddenROTRAmt && useVPTERNLOG(Subtarget, VT)) { - std::swap(ShiftLHS, ShiftRHS); - Amt = HiddenROTRAmt; + // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG. + if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) { + Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); + IsROTL = true; } + unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL; + unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL; + // Turn 'a' into a mask suitable for VSELECT: a = a << 5; // We can safely do this using i16 shifts as we're only interested in // the 3 lower bits of each byte. @@ -29952,18 +30024,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return SignBitSelect(VT, Amt, M, R); } - // ISD::ROT* uses modulo rotate amounts. - if (SDValue BaseRotAmt = DAG.getSplatValue(Amt)) { - // If the amount is a splat, perform the modulo BEFORE the splat, - // this helps LowerScalarVariableShift to remove the splat later. 
- Amt = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, BaseRotAmt); - Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - Amt = DAG.getVectorShuffle(VT, DL, Amt, DAG.getUNDEF(VT), - SmallVector<int>(NumElts, 0)); - } else { - Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); - } - + bool IsSplatAmt = DAG.isSplatValue(Amt); bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) && supportedVectorVarShift(VT, Subtarget, ISD::SRL); @@ -29971,13 +30032,25 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, // Fallback for splats + all supported variable shifts. // Fallback for non-constants AVX2 vXi16 as well. if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { + Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); - SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt); - SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR); + SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt); + SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR); return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); } + // Everything below assumes ISD::ROTL. + if (!IsROTL) { + Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); + IsROTL = true; + } + + // ISD::ROT* uses modulo rotate amounts. + Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); + + assert(IsROTL && "Only ROTL supported"); + // As with shifts, attempt to convert the rotation amount to a multiplication // factor, fallback to general expansion. 
SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); @@ -32927,11 +33000,6 @@ bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { unsigned Bits = Ty->getScalarSizeInBits(); - // 8-bit shifts are always expensive, but versions with a scalar amount aren't - // particularly cheaper than those without. - if (Bits == 8) - return false; - // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. if (Subtarget.hasXOP() && @@ -36249,9 +36317,10 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16 - : !Subtarget.hasSSE2() ? MVT::v4f32 - : MaskVT; + if (MaskEltSize == 16) + SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); + else + SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; return true; } } @@ -36300,9 +36369,10 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, isUndefOrEqual(Mask[0], 0) && isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = MaskEltSize == 16 ? MVT::v8f16 - : !Subtarget.hasSSE2() ? MVT::v4f32 - : MaskVT; + if (MaskEltSize == 16) + SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); + else + SrcVT = DstVT = !Subtarget.hasSSE2() ? 
MVT::v4f32 : MaskVT; return true; } @@ -40981,6 +41051,28 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( Op, DemandedBits, DemandedElts, DAG, Depth); } +bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, + const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const { + unsigned NumElts = DemandedElts.getBitWidth(); + unsigned Opc = Op.getOpcode(); + + switch (Opc) { + case X86ISD::VBROADCAST: + case X86ISD::VBROADCAST_LOAD: + // TODO: Permit vXi64 types on 32-bit targets. + if (isTypeLegal(Op.getValueType().getVectorElementType())) { + UndefElts = APInt::getNullValue(NumElts); + return true; + } + return false; + } + + return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, + Depth); +} + // Helper to peek through bitops/trunc/setcc to determine size of source vector. // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, @@ -46204,25 +46296,27 @@ static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FAND to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { - return DAG.getBitcast( - MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, - DAG.getBitcast(MVT::v4f32, N->getOperand(0)), - DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); + return DAG.getBitcast(MVT::v4i32, + DAG.getNode(X86ISD::FAND, dl, MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N0), + DAG.getBitcast(MVT::v4f32, N1))); } // Use a 32-bit and+zext if upper bits known zero. 
- if (VT == MVT::i64 && Subtarget.is64Bit() && - !isa<ConstantSDNode>(N->getOperand(1))) { + if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) { APInt HiMask = APInt::getHighBitsSet(64, 32); - if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) || - DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) { - SDLoc dl(N); - SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0)); - SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1)); + if (DAG.MaskedValueIsZero(N1, HiMask) || + DAG.MaskedValueIsZero(N0, HiMask)) { + SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0); + SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); } @@ -46235,8 +46329,6 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SmallVector<APInt, 2> SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) && SrcOps.size() == 1) { - SDLoc dl(N); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); @@ -46276,33 +46368,57 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) return R; - // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + // Attempt to recursively combine a bitmask AND with shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // zero are actually demanded by the other operand. 
+ auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + APInt UndefElts; + SmallVector<APInt> EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) + return false; + + APInt DemandedElts = APInt::getZero(NumElts); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isZero()) + DemandedElts.setBit(I); + + APInt KnownUndef, KnownZero; + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, + KnownZero, DCI); + }; + if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } // Attempt to combine a scalar bitmask AND with an extracted shuffle. if ((VT.getScalarSizeInBits() % 8) == 0 && - N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) { - SDValue BitMask = N->getOperand(1); - SDValue SrcVec = N->getOperand(0).getOperand(0); + N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N0.getOperand(1))) { + SDValue BitMask = N1; + SDValue SrcVec = N0.getOperand(0); EVT SrcVecVT = SrcVec.getValueType(); // Check that the constant bitmask masks whole bytes. APInt UndefElts; SmallVector<APInt, 64> EltBits; - if (VT == SrcVecVT.getScalarType() && - N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && + if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) && getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && llvm::all_of(EltBits, [](const APInt &M) { return M.isZero() || M.isAllOnes(); })) { unsigned NumElts = SrcVecVT.getVectorNumElements(); unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; - unsigned Idx = N->getOperand(0).getConstantOperandVal(1); + unsigned Idx = N0.getConstantOperandVal(1); // Create a root shuffle mask from the byte mask and the extracted index. 
SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef); @@ -46318,8 +46434,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, X86::MaxShuffleCombineDepth, /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, - N->getOperand(0).getOperand(1)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, + N0.getOperand(1)); } } @@ -46644,11 +46760,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); + SDLoc dl(N); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If this is SSE1 only convert to FOR to avoid scalarization. if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { return DAG.getBitcast(MVT::v4i32, - DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, + DAG.getNode(X86ISD::FOR, dl, MVT::v4f32, DAG.getBitcast(MVT::v4f32, N0), DAG.getBitcast(MVT::v4f32, N1))); } @@ -46660,8 +46778,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, SmallVector<APInt, 2> SrcPartials; if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) && SrcOps.size() == 1) { - SDLoc dl(N); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); @@ -46707,7 +46823,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && N1.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) { - SDLoc dl(N); return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N0, 0, DAG, dl, HalfElts), @@ -46716,7 +46831,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (NumElts >= 16 && N0.getOpcode() == 
X86ISD::KSHIFTL && N0.getConstantOperandAPInt(1) == HalfElts && DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) { - SDLoc dl(N); return DAG.getNode( ISD::CONCAT_VECTORS, dl, VT, extractSubVector(N1, 0, DAG, dl, HalfElts), @@ -46724,11 +46838,36 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } } - // Attempt to recursively combine an OR of shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { + // Attempt to recursively combine an OR of shuffles. SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) return Res; + + // If either operand is a constant mask, then only the elements that aren't + // allones are actually demanded by the other operand. + auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { + APInt UndefElts; + SmallVector<APInt> EltBits; + int NumElts = VT.getVectorNumElements(); + int EltSizeInBits = VT.getScalarSizeInBits(); + if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) + return false; + + APInt DemandedElts = APInt::getZero(NumElts); + for (int I = 0; I != NumElts; ++I) + if (!EltBits[I].isAllOnes()) + DemandedElts.setBit(I); + + APInt KnownUndef, KnownZero; + return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, KnownUndef, + KnownZero, DCI); + }; + if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); + } } // We should fold "masked merge" patterns when `andn` is not available. @@ -52111,7 +52250,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, case X86ISD::VSHLI: case X86ISD::VSRLI: // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle. - // TODO: Move this to LowerScalarImmediateShift? + // TODO: Move this to LowerShiftByScalarImmediate? 
if (VT == MVT::v4i64 && !Subtarget.hasInt256() && llvm::all_of(Ops, [](SDValue Op) { return Op.getConstantOperandAPInt(1) == 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6805cb75f0f2..d1d6e319f16b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1100,6 +1100,12 @@ namespace llvm { bool shouldSplatInsEltVarIndex(EVT VT) const override; + bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override { + // Converting to sat variants holds little benefit on X86 as we will just + // need to saturate the value back using fp arithmatic. + return Op != ISD::FP_TO_UINT_SAT && isOperationLegalOrCustom(Op, VT); + } + bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); } @@ -1153,6 +1159,10 @@ namespace llvm { SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override; + bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, + APInt &UndefElts, + unsigned Depth) const override; + const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override; SDValue unwrapAddress(SDValue N) const override; diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 732b2b1a5ada..6642f46e64b2 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -137,8 +137,10 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { Changed |= addENDBR(MBB, MBB.begin()); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { - if (I->isCall() && IsCallReturnTwice(I->getOperand(0))) + if (I->isCall() && I->getNumOperands() > 0 && + IsCallReturnTwice(I->getOperand(0))) { Changed |= addENDBR(MBB, std::next(I)); + } } // Exception handle may indirectly jump to catch pad, So we should add diff --git 
a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1db83033ba35..ecd4777c3533 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -9958,74 +9958,74 @@ multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode, } defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", - WriteShuffle256, truncstore_us_vi8, + WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, select_trunc, - WriteShuffle256, truncstorevi16, + WriteVPMOV256, truncstorevi16, masked_truncstorevi16, X86vtrunc, X86vmtrunc>; defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi16, + WriteVPMOV256, truncstore_s_vi16, masked_truncstore_s_vi16, X86vtruncs, X86vmtruncs>; defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi16, masked_truncstore_us_vi16, X86vtruncus, X86vmtruncus>; defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, select_trunc, - WriteShuffle256, truncstorevi32, + WriteVPMOV256, truncstorevi32, masked_truncstorevi32, X86vtrunc, X86vmtrunc>; defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi32, + WriteVPMOV256, truncstore_s_vi32, masked_truncstore_s_vi32, X86vtruncs, X86vmtruncs>; defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi32, masked_truncstore_us_vi32, X86vtruncus, 
X86vmtruncus>; defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, select_trunc, - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, select_trunc, - WriteShuffle256, truncstorevi16, + WriteVPMOV256, truncstorevi16, masked_truncstorevi16, X86vtrunc, X86vmtrunc>; defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi16, + WriteVPMOV256, truncstore_s_vi16, masked_truncstore_s_vi16, X86vtruncs, X86vmtruncs>; defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi16, masked_truncstore_us_vi16, X86vtruncus, X86vmtruncus>; defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, select_trunc, - WriteShuffle256, truncstorevi8, + WriteVPMOV256, truncstorevi8, masked_truncstorevi8, X86vtrunc, X86vmtrunc>; defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, select_truncs, - WriteShuffle256, truncstore_s_vi8, + WriteVPMOV256, truncstore_s_vi8, masked_truncstore_s_vi8, X86vtruncs, X86vmtruncs>; defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, - select_truncus, WriteShuffle256, + select_truncus, WriteVPMOV256, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; @@ -10084,7 +10084,7 @@ defm : mtrunc_lowering<"VPMOVSQWZ", X86vmtruncs, v8i16x_info, v8i64_info>; defm : mtrunc_lowering<"VPMOVUSQWZ", X86vmtruncus, v8i16x_info, v8i64_info>; } -multiclass 
WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, +multiclass avx512_pmovx_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ let ExeDomain = DestInfo.ExeDomain in { @@ -10100,135 +10100,140 @@ multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSche } } -multiclass WriteShuffle256_BW<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v8i16x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i16x_info, v16i8x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasBWI] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v32i16_info, v32i8x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: 
WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v4i32x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i32x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i32_info, v16i8x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_bq<bits<8> opc, string OpcodeStr, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v16i8x_info, i16mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v16i8x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_wd<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = 
!cast<PatFrag>(ExtTy#"extloadvi16")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v4i32x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i32x_info, v8i16x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v16i32_info, v16i16x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_wq<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v8i16x_info, i32mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v8i16x_info, 
i128mem, LdFrag, OpNode>, EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } -multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr, +multiclass avx512_pmovx_dq<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode InVecNode, string ExtTy, - X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + X86FoldableSchedWrite schedX, X86FoldableSchedWrite schedYZ, + PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + defm Z128: avx512_pmovx_common<opc, OpcodeStr, schedX, v2i64x_info, v4i32x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; - defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + defm Z256: avx512_pmovx_common<opc, OpcodeStr, schedYZ, v4i64x_info, v4i32x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; } let Predicates = [HasAVX512] in { - defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + defm Z : avx512_pmovx_common<opc, OpcodeStr, schedYZ, v8i64_info, v8i32x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; } } -defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", zext, zext_invec, "z", WriteShuffle256>; -defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", zext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBW : avx512_pmovx_bw<0x30, "vpmovzxbw", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXBD : avx512_pmovx_bd<0x31, "vpmovzxbd", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; 
+defm VPMOVZXBQ : avx512_pmovx_bq<0x32, "vpmovzxbq", zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXWD : avx512_pmovx_wd<0x33, "vpmovzxwd", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXWQ : avx512_pmovx_wq<0x34, "vpmovzxwq", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVZXDQ : avx512_pmovx_dq<0x35, "vpmovzxdq", zext, zext_invec, "z", SchedWriteShuffle.XMM, WriteVPMOV256>; -defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", sext, sext_invec, "s", WriteShuffle256>; -defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", sext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBW: avx512_pmovx_bw<0x20, "vpmovsxbw", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXBD: avx512_pmovx_bd<0x21, "vpmovsxbd", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXBQ: avx512_pmovx_bq<0x22, "vpmovsxbq", sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXWD: avx512_pmovx_wd<0x23, "vpmovsxwd", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXWQ: avx512_pmovx_wq<0x24, "vpmovsxwq", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; +defm VPMOVSXDQ: avx512_pmovx_dq<0x25, "vpmovsxdq", sext, sext_invec, "s", SchedWriteShuffle.XMM, WriteVPMOV256>; // Patterns that we also need any extend versions of. 
aext_vector_inreg @@ -10523,21 +10528,22 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr, SchedWrite Sched> { def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"), [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>, - EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc? + EVEX, Sched<[Sched]>; } multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, string OpcodeStr, Predicate prd> { +// TODO - Replace WriteMove with WriteVecTrunc? let Predicates = [prd] in - defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteMove>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; - defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteMove>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteMove>, EVEX_V128; } } diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ba52283b570d..7288ce812138 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -260,10 +260,10 @@ let isPseudo = 1, SchedRW = [WriteSystem] in { // Pseudo instructions used by address sanitizer. 
//===----------------------------------------------------------------------===// let - Defs = [R8, EFLAGS] in { + Defs = [R10, R11, EFLAGS] in { def ASAN_CHECK_MEMACCESS : PseudoI< - (outs), (ins GR64NoR8:$addr, i32imm:$accessinfo), - [(int_asan_check_memaccess GR64NoR8:$addr, (i32 timm:$accessinfo))]>, + (outs), (ins GR64PLTSafe:$addr, i32imm:$accessinfo), + [(int_asan_check_memaccess GR64PLTSafe:$addr, (i32 timm:$accessinfo))]>, Sched<[]>; } diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp index 6d4ad08842c7..226349485238 100644 --- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp +++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp @@ -529,11 +529,11 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, - { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, TB_ALIGN_16 }, - { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, - { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, TB_NO_REVERSE }, - { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, TB_ALIGN_16 }, - { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, TB_NO_REVERSE }, + { X86::MMX_CVTPD2PIrr, X86::MMX_CVTPD2PIrm, TB_ALIGN_16 }, + { X86::MMX_CVTPI2PDrr, X86::MMX_CVTPI2PDrm, 0 }, + { X86::MMX_CVTPS2PIrr, X86::MMX_CVTPS2PIrm, TB_NO_REVERSE }, + { X86::MMX_CVTTPD2PIrr, X86::MMX_CVTTPD2PIrm, TB_ALIGN_16 }, + { X86::MMX_CVTTPS2PIrr, X86::MMX_CVTTPS2PIrm, TB_NO_REVERSE }, { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, { X86::MMX_PABSBrr, X86::MMX_PABSBrm, 0 }, { X86::MMX_PABSDrr, X86::MMX_PABSDrm, 0 }, @@ -1339,29 +1339,29 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, - { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, - { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, - { X86::MMX_PACKSSWBirr, 
X86::MMX_PACKSSWBirm, 0 }, - { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, - { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, - { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, - { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, - { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, - { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, - { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, - { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, - { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, + { X86::MMX_CVTPI2PSrr, X86::MMX_CVTPI2PSrm, 0 }, + { X86::MMX_PACKSSDWrr, X86::MMX_PACKSSDWrm, 0 }, + { X86::MMX_PACKSSWBrr, X86::MMX_PACKSSWBrm, 0 }, + { X86::MMX_PACKUSWBrr, X86::MMX_PACKUSWBrm, 0 }, + { X86::MMX_PADDBrr, X86::MMX_PADDBrm, 0 }, + { X86::MMX_PADDDrr, X86::MMX_PADDDrm, 0 }, + { X86::MMX_PADDQrr, X86::MMX_PADDQrm, 0 }, + { X86::MMX_PADDSBrr, X86::MMX_PADDSBrm, 0 }, + { X86::MMX_PADDSWrr, X86::MMX_PADDSWrm, 0 }, + { X86::MMX_PADDUSBrr, X86::MMX_PADDUSBrm, 0 }, + { X86::MMX_PADDUSWrr, X86::MMX_PADDUSWrm, 0 }, + { X86::MMX_PADDWrr, X86::MMX_PADDWrm, 0 }, { X86::MMX_PALIGNRrri, X86::MMX_PALIGNRrmi, 0 }, - { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, - { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, - { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, - { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, - { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, - { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, - { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, - { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, - { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, - { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, + { X86::MMX_PANDNrr, X86::MMX_PANDNrm, 0 }, + { X86::MMX_PANDrr, X86::MMX_PANDrm, 0 }, + { X86::MMX_PAVGBrr, X86::MMX_PAVGBrm, 0 }, + { X86::MMX_PAVGWrr, X86::MMX_PAVGWrm, 0 }, + { X86::MMX_PCMPEQBrr, X86::MMX_PCMPEQBrm, 0 }, + { X86::MMX_PCMPEQDrr, X86::MMX_PCMPEQDrm, 0 }, + { X86::MMX_PCMPEQWrr, X86::MMX_PCMPEQWrm, 0 }, + { X86::MMX_PCMPGTBrr, X86::MMX_PCMPGTBrm, 0 }, + { X86::MMX_PCMPGTDrr, 
X86::MMX_PCMPGTDrm, 0 }, + { X86::MMX_PCMPGTWrr, X86::MMX_PCMPGTWrm, 0 }, { X86::MMX_PHADDDrr, X86::MMX_PHADDDrm, 0 }, { X86::MMX_PHADDSWrr, X86::MMX_PHADDSWrm, 0 }, { X86::MMX_PHADDWrr, X86::MMX_PHADDWrm, 0 }, @@ -1370,18 +1370,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PHSUBWrr, X86::MMX_PHSUBWrm, 0 }, { X86::MMX_PINSRWrr, X86::MMX_PINSRWrm, TB_NO_REVERSE }, { X86::MMX_PMADDUBSWrr, X86::MMX_PMADDUBSWrm, 0 }, - { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, - { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, - { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, - { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, - { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, + { X86::MMX_PMADDWDrr, X86::MMX_PMADDWDrm, 0 }, + { X86::MMX_PMAXSWrr, X86::MMX_PMAXSWrm, 0 }, + { X86::MMX_PMAXUBrr, X86::MMX_PMAXUBrm, 0 }, + { X86::MMX_PMINSWrr, X86::MMX_PMINSWrm, 0 }, + { X86::MMX_PMINUBrr, X86::MMX_PMINUBrm, 0 }, { X86::MMX_PMULHRSWrr, X86::MMX_PMULHRSWrm, 0 }, - { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, - { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, - { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, - { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, - { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, - { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, + { X86::MMX_PMULHUWrr, X86::MMX_PMULHUWrm, 0 }, + { X86::MMX_PMULHWrr, X86::MMX_PMULHWrm, 0 }, + { X86::MMX_PMULLWrr, X86::MMX_PMULLWrm, 0 }, + { X86::MMX_PMULUDQrr, X86::MMX_PMULUDQrm, 0 }, + { X86::MMX_PORrr, X86::MMX_PORrm, 0 }, + { X86::MMX_PSADBWrr, X86::MMX_PSADBWrm, 0 }, { X86::MMX_PSHUFBrr, X86::MMX_PSHUFBrm, 0 }, { X86::MMX_PSIGNBrr, X86::MMX_PSIGNBrm, 0 }, { X86::MMX_PSIGNDrr, X86::MMX_PSIGNDrm, 0 }, @@ -1394,21 +1394,21 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, - { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, - { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, - { 
X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 }, - { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, - { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, - { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, - { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, - { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, - { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, - { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, - { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, - { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, TB_NO_REVERSE }, - { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, TB_NO_REVERSE }, - { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, TB_NO_REVERSE }, - { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, + { X86::MMX_PSUBBrr, X86::MMX_PSUBBrm, 0 }, + { X86::MMX_PSUBDrr, X86::MMX_PSUBDrm, 0 }, + { X86::MMX_PSUBQrr, X86::MMX_PSUBQrm, 0 }, + { X86::MMX_PSUBSBrr, X86::MMX_PSUBSBrm, 0 }, + { X86::MMX_PSUBSWrr, X86::MMX_PSUBSWrm, 0 }, + { X86::MMX_PSUBUSBrr, X86::MMX_PSUBUSBrm, 0 }, + { X86::MMX_PSUBUSWrr, X86::MMX_PSUBUSWrm, 0 }, + { X86::MMX_PSUBWrr, X86::MMX_PSUBWrm, 0 }, + { X86::MMX_PUNPCKHBWrr, X86::MMX_PUNPCKHBWrm, 0 }, + { X86::MMX_PUNPCKHDQrr, X86::MMX_PUNPCKHDQrm, 0 }, + { X86::MMX_PUNPCKHWDrr, X86::MMX_PUNPCKHWDrm, 0 }, + { X86::MMX_PUNPCKLBWrr, X86::MMX_PUNPCKLBWrm, TB_NO_REVERSE }, + { X86::MMX_PUNPCKLDQrr, X86::MMX_PUNPCKLDQrm, TB_NO_REVERSE }, + { X86::MMX_PUNPCKLWDrr, X86::MMX_PUNPCKLWDrm, TB_NO_REVERSE }, + { X86::MMX_PXORrr, X86::MMX_PXORrm, 0 }, { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, { X86::MOVSDrr, X86::MOVLPDrm, TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index bb5637a31947..c379aa8d9258 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4088,8 +4088,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, 
Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, bool *IsSwapped, - int64_t *ImmDelta) const { + const MachineInstr &OI, + bool *IsSwapped) const { switch (OI.getOpcode()) { case X86::CMP64rr: case X86::CMP32rr: @@ -4140,21 +4140,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, int64_t OIMask; int64_t OIValue; if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) && - SrcReg == OISrcReg && ImmMask == OIMask) { - if (OIValue == ImmValue) { - *ImmDelta = 0; - return true; - } else if (static_cast<uint64_t>(ImmValue) == - static_cast<uint64_t>(OIValue) - 1) { - *ImmDelta = -1; - return true; - } else if (static_cast<uint64_t>(ImmValue) == - static_cast<uint64_t>(OIValue) + 1) { - *ImmDelta = 1; - return true; - } else { - return false; - } + SrcReg == OISrcReg && ImmMask == OIMask && OIValue == ImmValue) { + assert(SrcReg2 == X86::NoRegister && OISrcReg2 == X86::NoRegister && + "should not have 2nd register"); + return true; } } return FlagI.isIdenticalTo(OI); @@ -4404,7 +4393,6 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, bool ShouldUpdateCC = false; bool IsSwapped = false; X86::CondCode NewCC = X86::COND_INVALID; - int64_t ImmDelta = 0; // Search backward from CmpInstr for the next instruction defining EFLAGS. const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -4451,7 +4439,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // ... // EFLAGS not changed // cmp x, y // <-- can be removed if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue, - Inst, &IsSwapped, &ImmDelta)) { + Inst, &IsSwapped)) { Sub = &Inst; break; } @@ -4485,7 +4473,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // It is safe to remove CmpInstr if EFLAGS is redefined or killed. // If we are done with the basic block, we need to check whether EFLAGS is // live-out. 
- bool FlagsMayLiveOut = true; + bool IsSafe = false; SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate; MachineBasicBlock::iterator AfterCmpInstr = std::next(MachineBasicBlock::iterator(CmpInstr)); @@ -4495,7 +4483,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // We should check the usage if this instruction uses and updates EFLAGS. if (!UseEFLAGS && ModifyEFLAGS) { // It is safe to remove CmpInstr if EFLAGS is updated again. - FlagsMayLiveOut = false; + IsSafe = true; break; } if (!UseEFLAGS && !ModifyEFLAGS) @@ -4503,7 +4491,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // EFLAGS is used by this instruction. X86::CondCode OldCC = X86::COND_INVALID; - if (MI || IsSwapped || ImmDelta != 0) { + if (MI || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) OldCC = X86::getCondFromBranch(Instr); @@ -4555,60 +4543,11 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. ReplacementCC = getSwappedCondition(OldCC); - if (ReplacementCC == X86::COND_INVALID) return false; - ShouldUpdateCC = true; - } else if (ImmDelta != 0) { - unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg)); - // Shift amount for min/max constants to adjust for 8/16/32 instruction - // sizes. 
- switch (OldCC) { - case X86::COND_L: // x <s (C + 1) --> x <=s C - if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_LE; - break; - case X86::COND_B: // x <u (C + 1) --> x <=u C - if (ImmDelta != 1 || CmpValue == 0) - return false; - ReplacementCC = X86::COND_BE; - break; - case X86::COND_GE: // x >=s (C + 1) --> x >s C - if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_G; - break; - case X86::COND_AE: // x >=u (C + 1) --> x >u C - if (ImmDelta != 1 || CmpValue == 0) - return false; - ReplacementCC = X86::COND_A; - break; - case X86::COND_G: // x >s (C - 1) --> x >=s C - if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_GE; - break; - case X86::COND_A: // x >u (C - 1) --> x >=u C - if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_AE; - break; - case X86::COND_LE: // x <=s (C - 1) --> x <s C - if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_L; - break; - case X86::COND_BE: // x <=u (C - 1) --> x <u C - if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue) - return false; - ReplacementCC = X86::COND_B; - break; - default: + if (ReplacementCC == X86::COND_INVALID) return false; - } - ShouldUpdateCC = true; } - if (ShouldUpdateCC && ReplacementCC != OldCC) { + if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. @@ -4616,14 +4555,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. 
- FlagsMayLiveOut = false; + IsSafe = true; break; } } - // If we have to update users but EFLAGS is live-out abort, since we cannot - // easily find all of the users. - if (ShouldUpdateCC && FlagsMayLiveOut) { + // If EFLAGS is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if ((MI || IsSwapped) && !IsSafe) { for (MachineBasicBlock *Successor : CmpMBB.successors()) if (Successor->isLiveIn(X86::EFLAGS)) return false; @@ -4944,7 +4883,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::SETB_C64r: return Expand2AddrUndef(MIB, get(X86::SBB64rr)); case X86::MMX_SET0: - return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr)); + return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr)); case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: @@ -5217,12 +5156,12 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum, bool ForLoadFold = false) { // Set the OpNum parameter to the first source operand. switch (Opcode) { - case X86::MMX_PUNPCKHBWirr: - case X86::MMX_PUNPCKHWDirr: - case X86::MMX_PUNPCKHDQirr: - case X86::MMX_PUNPCKLBWirr: - case X86::MMX_PUNPCKLWDirr: - case X86::MMX_PUNPCKLDQirr: + case X86::MMX_PUNPCKHBWrr: + case X86::MMX_PUNPCKHWDrr: + case X86::MMX_PUNPCKHDQrr: + case X86::MMX_PUNPCKLBWrr: + case X86::MMX_PUNPCKLWDrr: + case X86::MMX_PUNPCKLDQrr: case X86::MOVHLPSrr: case X86::PACKSSWBrr: case X86::PACKUSWBrr: diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 33ce55bbdb2b..537ada6222bf 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -643,8 +643,7 @@ private: /// CMP %1, %2 and %3 = SUB %2, %1 ; IsSwapped=true bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg, Register SrcReg2, int64_t ImmMask, int64_t ImmValue, - const MachineInstr &OI, bool *IsSwapped, - int64_t *ImmDelta) const; + const MachineInstr &OI, bool *IsSwapped) const; }; } // namespace llvm diff 
--git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index bb3e6df3bf3e..aeecc25ddea2 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -34,14 +34,14 @@ let Constraints = "$src1 = $dst" in { multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, X86FoldableSchedWrite sched, bit Commutable = 0, X86MemOperand OType = i64mem> { - def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, VR64:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>, Sched<[sched]> { let isCommutable = Commutable; } - def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), (ins VR64:$src1, OType:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>, @@ -123,25 +123,25 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId, multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, X86FoldableSchedWrite sched, Domain d> { - def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, - [(set DstRC:$dst, (Int SrcRC:$src))], d>, - Sched<[sched]>; - def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, - [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>, - Sched<[sched.Folded]>; + def rr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], d>, + Sched<[sched]>; + def rm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>, + Sched<[sched.Folded]>; } multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { - def irr : 
MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), - (ins DstRC:$src1, SrcRC:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>, - Sched<[WriteCvtI2PS]>; - def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), asm, - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>, - Sched<[WriteCvtI2PS.Folded]>; + def rr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>, + Sched<[WriteCvtI2PS]>; + def rm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>, + Sched<[WriteCvtI2PS.Folded]>; } //===----------------------------------------------------------------------===// @@ -569,14 +569,14 @@ def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), (MMX_MOVFR642Qrr FR64:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))), - (MMX_CVTPS2PIirr VR128:$src)>; + (MMX_CVTPS2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), - (MMX_CVTTPS2PIirr VR128:$src)>; + (MMX_CVTTPS2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), - (MMX_CVTPD2PIirr VR128:$src)>; + (MMX_CVTPD2PIrr VR128:$src)>; def : Pat<(x86mmx (MMX_X86movdq2q (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), - (MMX_CVTTPD2PIirr VR128:$src)>; + (MMX_CVTTPD2PIrr VR128:$src)>; } diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index c3cd634612a4..9044f10ec630 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -48,6 +48,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Instrumentation/AddressSanitizer.h" #include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h" +#include <string> using namespace llvm; @@ -1336,235 
+1337,29 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) { return; } - unsigned Reg = MI.getOperand(0).getReg().id(); + const auto &Reg = MI.getOperand(0).getReg(); ASanAccessInfo AccessInfo(MI.getOperand(1).getImm()); - MCSymbol *&Sym = - AsanMemaccessSymbols[AsanMemaccessTuple(Reg, AccessInfo.Packed)]; - if (!Sym) { - std::string Name = AccessInfo.IsWrite ? "store" : "load"; - std::string SymName = "__asan_check_" + Name + - utostr(1ULL << AccessInfo.AccessSizeIndex) + "_rn" + - utostr(Reg); - Sym = OutContext.getOrCreateSymbol(SymName); - } - - EmitAndCountInstruction( - MCInstBuilder(X86::CALL64pcrel32) - .addExpr(MCSymbolRefExpr::create(Sym, OutContext))); -} - -void X86AsmPrinter::emitAsanMemaccessPartial(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI) { - assert(AccessInfo.AccessSizeIndex == 0 || AccessInfo.AccessSizeIndex == 1 || - AccessInfo.AccessSizeIndex == 2); - assert(Reg != X86::R8); - uint64_t ShadowBase; int MappingScale; bool OrShadowOffset; - getAddressSanitizerParams( - Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(), - AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - - OutStreamer->emitInstruction( - MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(MappingScale), - STI); - if (OrShadowOffset) { - OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(ShadowBase), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV8rm) - .addReg(X86::R8B) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(0) - .addReg(X86::NoRegister), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::TEST8rr).addReg(X86::R8B).addReg(X86::R8B), STI); - } else { - OutStreamer->emitInstruction(MCInstBuilder(X86::MOVSX32rm8) - .addReg(X86::R8D) - 
.addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(ShadowBase) - .addReg(X86::NoRegister), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::TEST32rr).addReg(X86::R8D).addReg(X86::R8D), STI); - } - MCSymbol *AdditionalCheck = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(AdditionalCheck, OutContext)) - .addImm(X86::COND_NE), - STI); - MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI); - - // Shadow byte is non-zero so we need to perform additional checks. - OutStreamer->emitLabel(AdditionalCheck); - OutStreamer->emitInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RCX), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr) - .addReg(X86::RCX) - .addReg(X86::NoRegister + Reg), - STI); - const size_t Granularity = 1ULL << MappingScale; - OutStreamer->emitInstruction(MCInstBuilder(X86::AND32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(Granularity - 1), - STI); - if (AccessInfo.AccessSizeIndex == 1) { - OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(1), - STI); - } else if (AccessInfo.AccessSizeIndex == 2) { - OutStreamer->emitInstruction(MCInstBuilder(X86::ADD32ri8) - .addReg(X86::NoRegister) - .addReg(X86::ECX) - .addImm(3), - STI); - } - - OutStreamer->emitInstruction( - MCInstBuilder(X86::CMP32rr).addReg(X86::ECX).addReg(X86::R8D).addImm(1), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RCX), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)) - .addImm(X86::COND_L), - STI); - - emitAsanReportError(M, Reg, AccessInfo, STI); -} - -void X86AsmPrinter::emitAsanMemaccessFull(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - 
MCSubtargetInfo &STI) { - assert(AccessInfo.AccessSizeIndex == 3 || AccessInfo.AccessSizeIndex == 4); - assert(Reg != X86::R8); - - uint64_t ShadowBase; - int MappingScale; - bool OrShadowOffset; - getAddressSanitizerParams( - Triple(M.getTargetTriple()), M.getDataLayout().getPointerSizeInBits(), - AccessInfo.CompileKernel, &ShadowBase, &MappingScale, &OrShadowOffset); - - OutStreamer->emitInstruction( - MCInstBuilder(X86::MOV64rr).addReg(X86::R8).addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction(MCInstBuilder(X86::SHR64ri) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(MappingScale), - STI); - if (OrShadowOffset) { - OutStreamer->emitInstruction(MCInstBuilder(X86::OR64ri32) - .addReg(X86::R8) - .addReg(X86::R8) - .addImm(ShadowBase), - STI); - auto OpCode = AccessInfo.AccessSizeIndex == 3 ? X86::CMP8mi : X86::CMP16mi8; - OutStreamer->emitInstruction(MCInstBuilder(OpCode) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(0) - .addReg(X86::NoRegister) - .addImm(0), - STI); - } else { - auto OpCode = AccessInfo.AccessSizeIndex == 3 ? 
X86::CMP8mi : X86::CMP16mi8; - OutStreamer->emitInstruction(MCInstBuilder(OpCode) - .addReg(X86::R8) - .addImm(1) - .addReg(X86::NoRegister) - .addImm(ShadowBase) - .addReg(X86::NoRegister) - .addImm(0), - STI); - } - MCSymbol *ReportCode = OutContext.createTempSymbol(); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JCC_1) - .addExpr(MCSymbolRefExpr::create(ReportCode, OutContext)) - .addImm(X86::COND_NE), - STI); - MCSymbol *ReturnSym = OutContext.createTempSymbol(); - OutStreamer->emitLabel(ReturnSym); - OutStreamer->emitInstruction(MCInstBuilder(getRetOpcode(*Subtarget)), STI); - - OutStreamer->emitLabel(ReportCode); - emitAsanReportError(M, Reg, AccessInfo, STI); -} + getAddressSanitizerParams(Triple(TM.getTargetTriple()), 64, + AccessInfo.CompileKernel, &ShadowBase, + &MappingScale, &OrShadowOffset); -void X86AsmPrinter::emitAsanReportError(Module &M, unsigned Reg, - const ASanAccessInfo &AccessInfo, - MCSubtargetInfo &STI) { std::string Name = AccessInfo.IsWrite ? "store" : "load"; - MCSymbol *ReportError = OutContext.getOrCreateSymbol( - "__asan_report_" + Name + utostr(1ULL << AccessInfo.AccessSizeIndex)); - OutStreamer->emitInstruction(MCInstBuilder(X86::MOV64rr) - .addReg(X86::RDI) - .addReg(X86::NoRegister + Reg), - STI); - OutStreamer->emitInstruction( - MCInstBuilder(X86::JMP_4) - .addExpr(MCSymbolRefExpr::create(ReportError, MCSymbolRefExpr::VK_PLT, - OutContext)), - STI); -} - -void X86AsmPrinter::emitAsanMemaccessSymbols(Module &M) { - if (AsanMemaccessSymbols.empty()) - return; - - const Triple &TT = TM.getTargetTriple(); - assert(TT.isOSBinFormatELF()); - std::unique_ptr<MCSubtargetInfo> STI( - TM.getTarget().createMCSubtargetInfo(TT.str(), "", "")); - assert(STI && "Unable to create subtarget info"); - - for (auto &P : AsanMemaccessSymbols) { - MCSymbol *Sym = P.second; - OutStreamer->SwitchSection(OutContext.getELFSection( - ".text.hot", ELF::SHT_PROGBITS, - ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0, Sym->getName(), - 
/*IsComdat=*/true)); - - OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction); - OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak); - OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden); - OutStreamer->emitLabel(Sym); + std::string Op = OrShadowOffset ? "or" : "add"; + std::string SymName = "__asan_check_" + Name + "_" + Op + "_" + + utostr(1ULL << AccessInfo.AccessSizeIndex) + "_" + + TM.getMCRegisterInfo()->getName(Reg.asMCReg()); + if (OrShadowOffset) + report_fatal_error( + "OrShadowOffset is not supported with optimized callbacks"); - unsigned Reg = std::get<0>(P.first); - ASanAccessInfo AccessInfo(std::get<1>(P.first)); - - if (AccessInfo.AccessSizeIndex < 3) { - emitAsanMemaccessPartial(M, Reg, AccessInfo, *STI); - } else { - emitAsanMemaccessFull(M, Reg, AccessInfo, *STI); - } - } + EmitAndCountInstruction( + MCInstBuilder(X86::CALL64pcrel32) + .addExpr(MCSymbolRefExpr::create( + OutContext.getOrCreateSymbol(SymName), OutContext))); } void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, @@ -2615,6 +2410,15 @@ void X86AsmPrinter::emitInstruction(const MachineInstr *MI) { const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + if (MI->getOpcode() == X86::OR64rm) { + for (auto &Opd : MI->operands()) { + if (Opd.isSymbol() && StringRef(Opd.getSymbolName()) == + "swift_async_extendedFramePointerFlags") { + ShouldEmitWeakSwiftAsyncExtendedFramePointerFlags = true; + } + } + } + // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that // are compressed from EVEX encoding to VEX encoding. 
if (TM.Options.MCOptions.ShowMCEncoding) { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index d835f452b67e..1b704bcb8e08 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -430,11 +430,11 @@ def GR64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; -// GR64 - 64-bit GPRs without R8 and RIP. Could be used when emitting code for -// intrinsics, which use implict input registers. -def GR64NoR8 : RegisterClass<"X86", [i64], 64, - (add RAX, RCX, RDX, RSI, RDI, R9, R10, R11, - RBX, R14, R15, R12, R13, RBP, RSP)>; +// GR64PLTSafe - 64-bit GPRs without R10, R11, RSP and RIP. Could be used when +// emitting code for intrinsics, which use implict input registers. +def GR64PLTSafe : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, + RBX, R14, R15, R12, R13, RBP)>; // Segment registers for use by MOV instructions (and others) that have a // segment register as one operand. 
Always contain a 16-bit segment diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 2827981b7fb0..a6ff472aac6f 100644 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -783,7 +783,7 @@ def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[BWWriteResGroup27], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[BWWriteResGroup27], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[BWWriteResGroup27], (instregex "P(DEP|EXT)(32|64)rr", "(V?)CVTDQ2PS(Y?)rr")>; @@ -800,9 +800,9 @@ def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[BWWriteResGroup33], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { let Latency = 3; @@ -862,9 +862,9 @@ def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup42], (instrs MMX_CVTPI2PDirr)>; -def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[BWWriteResGroup42], (instrs MMX_CVTPI2PDrr)>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", "(V?)CVTSD2SSrr", @@ -1086,9 +1086,9 @@ def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[BWWriteResGroup79], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[BWWriteResGroup79], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> { let Latency = 7; @@ -1155,7 +1155,7 @@ def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> { let NumMicroOps = 
2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup91], (instrs MMX_CVTPI2PSirm, +def: InstRW<[BWWriteResGroup91], (instrs MMX_CVTPI2PSrm, CVTDQ2PSrm, VCVTDQ2PSrm)>; def: InstRW<[BWWriteResGroup91], (instregex "P(DEP|EXT)(32|64)rm")>; @@ -1236,8 +1236,8 @@ def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { def: InstRW<[BWWriteResGroup107], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPI2PDirm)>; -def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIirm", + MMX_CVTPI2PDrm)>; +def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVT(T?)PD2PIrm", "(V?)CVTDQ2PDrm", "(V?)CVTSD2SSrm")>; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 68961d6245ab..371a9571ae39 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -995,7 +995,7 @@ def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[HWWriteResGroup12], (instrs MMX_CVTPI2PSrm)>; def: InstRW<[HWWriteResGroup12], (instregex "P(DEP|EXT)(32|64)rm")>; def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { @@ -1164,9 +1164,9 @@ def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[HWWriteResGroup36_2], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[HWWriteResGroup36_2], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> { let Latency = 7; @@ -1240,7 +1240,7 @@ def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[HWWriteResGroup50], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[HWWriteResGroup50], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[HWWriteResGroup50], (instregex "P(DEP|EXT)(32|64)rr", "(V?)CVTDQ2PS(Y?)rr")>; @@ -1285,9 
+1285,9 @@ def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[HWWriteResGroup57], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { let Latency = 3; @@ -1373,11 +1373,11 @@ def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPI2PDirr, - MMX_CVTPD2PIirr, - MMX_CVTPS2PIirr, - MMX_CVTTPD2PIirr, - MMX_CVTTPS2PIirr)>; +def: InstRW<[HWWriteResGroup73], (instrs MMX_CVTPI2PDrr, + MMX_CVTPD2PIrr, + MMX_CVTPS2PIrr, + MMX_CVTTPD2PIrr, + MMX_CVTTPS2PIrr)>; def: InstRW<[HWWriteResGroup73], (instregex "(V?)CVTDQ2PDrr", "(V?)CVTPD2PSrr", "(V?)CVTSD2SSrr", @@ -1418,8 +1418,8 @@ def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { def: InstRW<[HWWriteResGroup78], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm, + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm, CVTDQ2PDrm, VCVTDQ2PDrm)>; @@ -1428,7 +1428,7 @@ def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDirm, +def: InstRW<[HWWriteResGroup78_1], (instrs MMX_CVTPI2PDrm, CVTSD2SSrm, CVTSD2SSrm_Int, VCVTSD2SSrm, VCVTSD2SSrm_Int)>; diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 889b9b7fa666..789de9eb5751 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -331,12 +331,12 @@ defm : ICXWriteResPair<WriteFLogicZ, [ICXPort05], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFTest, [ICXPort0], 2, [1], 1, 6>; // Floating point TEST instructions. 
defm : ICXWriteResPair<WriteFTestY, [ICXPort0], 2, [1], 1, 7>; defm : ICXWriteResPair<WriteFTestZ, [ICXPort0], 2, [1], 1, 7>; -defm : ICXWriteResPair<WriteFShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector shuffles. -defm : ICXWriteResPair<WriteFShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFShuffleZ, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFVarShuffle, [ICXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles. -defm : ICXWriteResPair<WriteFVarShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteFVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFShuffle, [ICXPort15], 1, [1], 1, 6>; // Floating point vector shuffles. +defm : ICXWriteResPair<WriteFShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFVarShuffle, [ICXPort15], 1, [1], 1, 6>; // Floating point vector variable shuffles. +defm : ICXWriteResPair<WriteFVarShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteFVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFBlend, [ICXPort015], 1, [1], 1, 6>; // Floating point vector blends. defm : ICXWriteResPair<WriteFBlendY,[ICXPort015], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteFBlendZ,[ICXPort015], 1, [1], 1, 7>; @@ -388,14 +388,14 @@ defm : ICXWriteResPair<WriteVecIMulZ, [ICXPort05], 5, [1], 1, 7>; defm : ICXWriteResPair<WritePMULLD, [ICXPort01], 10, [2], 2, 6>; // Vector PMULLD. defm : ICXWriteResPair<WritePMULLDY, [ICXPort01], 10, [2], 2, 7>; defm : ICXWriteResPair<WritePMULLDZ, [ICXPort05], 10, [2], 2, 7>; -defm : ICXWriteResPair<WriteShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector shuffles. 
-defm : ICXWriteResPair<WriteShuffleX, [ICXPort5], 1, [1], 1, 6>; -defm : ICXWriteResPair<WriteShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteVarShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector variable shuffles. -defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort5], 1, [1], 1, 6>; -defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort5], 1, [1], 1, 7>; -defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : ICXWriteResPair<WriteShuffleX, [ICXPort15], 1, [1], 1, 6>; +defm : ICXWriteResPair<WriteShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteShuffleZ, [ICXPort5], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteVarShuffle, [ICXPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : ICXWriteResPair<WriteVarShuffleX, [ICXPort15], 1, [1], 1, 6>; +defm : ICXWriteResPair<WriteVarShuffleY, [ICXPort15], 1, [1], 1, 7>; +defm : ICXWriteResPair<WriteVarShuffleZ, [ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteBlend, [ICXPort5], 1, [1], 1, 6>; // Vector blends. 
defm : ICXWriteResPair<WriteBlendY,[ICXPort5], 1, [1], 1, 7>; defm : ICXWriteResPair<WriteBlendZ,[ICXPort5], 1, [1], 1, 7>; @@ -642,15 +642,15 @@ def: InstRW<[ICXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", "KXOR(B|D|Q|W)rr", "KSET0(B|D|Q|W)", // Same as KXOR "KSET1(B|D|Q|W)", // Same as KXNOR - "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr", + "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + "MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr", "VPMOVB2M(Z|Z128|Z256)rr", "VPMOVD2M(Z|Z128|Z256)rr", "VPMOVQ2M(Z|Z128|Z256)rr", @@ -663,7 +663,16 @@ def ICXWriteResGroup3 : SchedWriteRes<[ICXPort5]> { } def: InstRW<[ICXWriteResGroup3], (instregex "COM(P?)_FST0r", "KMOV(B|D|Q|W)kr", - "UCOM_F(P?)r")>; + "UCOM_F(P?)r", + "VPBROADCAST(D|Q)rr", + "(V?)INSERTPS(Z?)rr", + "(V?)MOV(HL|LH)PS(Z?)rr", + "(V?)MOVDDUP(Y|Z|Z128|Z256)?rr", + "(V?)PALIGNR(Y|Z|Z128|Z256)?rri", + "(V?)PERMIL(PD|PS)(Y|Z|Z128|Z256)?ri", + "(V?)PERMIL(PD|PS)(Y|Z|Z128|Z256)?rr", + "(V?)PACK(U|S)S(DW|WB)(Y|Z|Z128|Z256)?rr", + "(V?)UNPCK(L|H)(PD|PS)(Y|Z|Z128|Z256)?rr")>; def ICXWriteResGroup4 : SchedWriteRes<[ICXPort6]> { let Latency = 1; @@ -702,6 +711,7 @@ def: InstRW<[ICXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr", "VBLENDMPS(Z128|Z256)rr", "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr", "(V?)PADD(B|D|Q|W)rr", + "(V?)MOV(SD|SS)(Z?)rr", "VPBLENDD(Y?)rri", "VPBLENDMB(Z128|Z256)rr", "VPBLENDMD(Z128|Z256)rr", @@ -892,9 +902,9 @@ def ICXWriteResGroup41 : SchedWriteRes<[ICXPort5,ICXPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[ICXWriteResGroup41], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[ICXWriteResGroup41], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def 
ICXWriteResGroup42 : SchedWriteRes<[ICXPort6,ICXPort0156]> { let Latency = 3; @@ -1055,8 +1065,8 @@ def ICXWriteResGroup61 : SchedWriteRes<[ICXPort5,ICXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[ICXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "VCVTDQ2PDZ128rr", "VCVTPD2DQZ128rr", "(V?)CVT(T?)PD2DQrr", @@ -1162,7 +1172,7 @@ def ICXWriteResGroup72 : SchedWriteRes<[ICXPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[ICXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[ICXWriteResGroup72], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[ICXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr", "VCOMPRESSPS(Z|Z128|Z256)rr", "VPCOMPRESSD(Z|Z128|Z256)rr", @@ -1174,26 +1184,26 @@ def ICXWriteResGroup73 : SchedWriteRes<[ICXPort0,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup73], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[ICXWriteResGroup73], (instrs MMX_PADDSBrm, + MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def ICXWriteResGroup76 : SchedWriteRes<[ICXPort6,ICXPort23]> { let Latency = 6; @@ -1295,20 +1305,14 @@ def ICXWriteResGroup92 : SchedWriteRes<[ICXPort5,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup92], 
(instregex "VMOVSDZrm(b?)", - "VMOVSSZrm(b?)")>; - -def ICXWriteResGroup92a : SchedWriteRes<[ICXPort5,ICXPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[ICXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm", - "(V?)PMOV(SX|ZX)BQrm", - "(V?)PMOV(SX|ZX)BWrm", - "(V?)PMOV(SX|ZX)DQrm", - "(V?)PMOV(SX|ZX)WDrm", - "(V?)PMOV(SX|ZX)WQrm")>; +def: InstRW<[ICXWriteResGroup92], (instregex "VMOV(SD|SS)Zrm(b?)", + "VPBROADCAST(B|W)(Z128)?rm", + "(V?)INSERTPS(Z?)rm", + "(V?)PALIGNR(Z128)?rmi", + "(V?)PERMIL(PD|PS)(Z128)?m(b?)i", + "(V?)PERMIL(PD|PS)(Z128)?rm", + "(V?)PACK(U|S)S(DW|WB)(Z128)?rm", + "(V?)UNPCK(L|H)(PD|PS)(Z128)?rm")>; def ICXWriteResGroup93 : SchedWriteRes<[ICXPort5,ICXPort015]> { let Latency = 7; @@ -1391,9 +1395,9 @@ def ICXWriteResGroup96 : SchedWriteRes<[ICXPort5,ICXPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[ICXWriteResGroup96], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[ICXWriteResGroup96], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def ICXWriteResGroup97 : SchedWriteRes<[ICXPort5,ICXPort015]> { let Latency = 7; @@ -1546,7 +1550,12 @@ def ICXWriteResGroup119 : SchedWriteRes<[ICXPort5,ICXPort23]> { } def: InstRW<[ICXWriteResGroup119], (instregex "FCOM(P?)(32|64)m", "VPBROADCASTB(Z|Z256)rm(b?)", - "VPBROADCASTW(Z|Z256)rm(b?)")>; + "VPBROADCASTW(Z|Z256)rm(b?)", + "(V?)PALIGNR(Y|Z|Z256)rmi", + "(V?)PERMIL(PD|PS)(Y|Z|Z256)m(b?)i", + "(V?)PERMIL(PD|PS)(Y|Z|Z256)rm", + "(V?)PACK(U|S)S(DW|WB)(Y|Z|Z256)rm", + "(V?)UNPCK(L|H)(PD|PS)(Y|Z|Z256)rm")>; def: InstRW<[ICXWriteResGroup119], (instrs VPBROADCASTBYrm, VPBROADCASTWYrm, VPMOVSXBDYrm, @@ -1683,7 +1692,7 @@ def ICXWriteResGroup135 : SchedWriteRes<[ICXPort0,ICXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[ICXWriteResGroup135], (instrs MMX_CVTPI2PSrm)>; def ICXWriteResGroup136 : 
SchedWriteRes<[ICXPort5,ICXPort23]> { let Latency = 9; @@ -1709,19 +1718,7 @@ def: InstRW<[ICXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", "VPMAXSQZ128rm(b?)", "VPMAXUQZ128rm(b?)", "VPMINSQZ128rm(b?)", - "VPMINUQZ128rm(b?)", - "VPMOVSXBDZ128rm(b?)", - "VPMOVSXBQZ128rm(b?)", - "VPMOVSXBWZ128rm(b?)", - "VPMOVSXDQZ128rm(b?)", - "VPMOVSXWDZ128rm(b?)", - "VPMOVSXWQZ128rm(b?)", - "VPMOVZXBDZ128rm(b?)", - "VPMOVZXBQZ128rm(b?)", - "VPMOVZXBWZ128rm(b?)", - "VPMOVZXDQZ128rm(b?)", - "VPMOVZXWDZ128rm(b?)", - "VPMOVZXWQZ128rm(b?)")>; + "VPMINUQZ128rm(b?)")>; def ICXWriteResGroup136_2 : SchedWriteRes<[ICXPort5,ICXPort23]> { let Latency = 10; @@ -1753,7 +1750,7 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { @@ -1950,8 +1947,8 @@ def ICXWriteResGroup166 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> { def: InstRW<[ICXWriteResGroup166], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def ICXWriteResGroup167 : SchedWriteRes<[ICXPort5,ICXPort23,ICXPort015]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index c8d7b0f72c1c..af5c0540deb5 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -623,7 +623,7 @@ def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { def: InstRW<[SBWriteResGroup5], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr, - MMX_PADDQirr, + MMX_PADDQrr, MMX_PALIGNRrri, MMX_PSIGNBrr, MMX_PSIGNDrr, @@ -870,7 +870,7 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SBWriteResGroup59], 
(instrs MMX_PADDQirm)>; +def: InstRW<[SBWriteResGroup59], (instrs MMX_PADDQrm)>; def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { let Latency = 7; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 7d3229c3b023..b3c13c72dd01 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -624,15 +624,15 @@ def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + "MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr")>; def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> { let Latency = 1; @@ -815,9 +815,9 @@ def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKLWriteResGroup39], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[SKLWriteResGroup39], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> { let Latency = 3; @@ -927,7 +927,7 @@ def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup59], (instrs MMX_CVTPI2PDirr, +def: InstRW<[SKLWriteResGroup59], (instrs MMX_CVTPI2PDrr, CVTDQ2PDrr, VCVTDQ2PDrr)>; @@ -936,8 +936,8 @@ def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup60], (instregex 
"MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "(V?)CVT(T?)PD2DQrr", "(V?)CVTPD2PSrr", "(V?)CVTPS2PDrr", @@ -984,33 +984,33 @@ def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKLWriteResGroup68], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[SKLWriteResGroup68], (instrs MMX_CVTPI2PSrr)>; def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> { let Latency = 6; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup69], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[SKLWriteResGroup69], (instrs MMX_PADDSBrm, + MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> { let Latency = 6; @@ -1144,9 +1144,9 @@ def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKLWriteResGroup92], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[SKLWriteResGroup92], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> { let Latency = 7; @@ -1283,7 +1283,7 @@ def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: 
InstRW<[SKLWriteResGroup120], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[SKLWriteResGroup120], (instrs MMX_CVTPI2PSrm)>; def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> { let Latency = 9; @@ -1302,7 +1302,7 @@ def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { @@ -1345,7 +1345,7 @@ def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } -def: InstRW<[SKLWriteResGroup138], (instrs MMX_CVTPI2PDirm)>; +def: InstRW<[SKLWriteResGroup138], (instrs MMX_CVTPI2PDrm)>; def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { let Latency = 10; @@ -1425,8 +1425,8 @@ def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { def: InstRW<[SKLWriteResGroup152], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 1d8417aef41e..74f9da158353 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -634,15 +634,15 @@ def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", "KXOR(B|D|Q|W)rr", "KSET0(B|D|Q|W)", // Same as KXOR "KSET1(B|D|Q|W)", // Same as KXNOR - "MMX_PADDS(B|W)irr", - "MMX_PADDUS(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMPEQ(B|D|W)irr", - "MMX_PCMPGT(B|D|W)irr", - "MMX_P(MAX|MIN)SWirr", - "MMX_P(MAX|MIN)UBirr", - "MMX_PSUBS(B|W)irr", - "MMX_PSUBUS(B|W)irr", + "MMX_PADDS(B|W)rr", + "MMX_PADDUS(B|W)rr", + "MMX_PAVG(B|W)rr", + 
"MMX_PCMPEQ(B|D|W)rr", + "MMX_PCMPGT(B|D|W)rr", + "MMX_P(MAX|MIN)SWrr", + "MMX_P(MAX|MIN)UBrr", + "MMX_PSUBS(B|W)rr", + "MMX_PSUBUS(B|W)rr", "VPMOVB2M(Z|Z128|Z256)rr", "VPMOVD2M(Z|Z128|Z256)rr", "VPMOVQ2M(Z|Z128|Z256)rr", @@ -884,9 +884,9 @@ def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKXWriteResGroup41], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; +def: InstRW<[SKXWriteResGroup41], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> { let Latency = 3; @@ -1047,8 +1047,8 @@ def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", - "MMX_CVT(T?)PS2PIirr", +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIrr", + "MMX_CVT(T?)PS2PIrr", "VCVTDQ2PDZ128rr", "VCVTPD2DQZ128rr", "(V?)CVT(T?)PD2DQrr", @@ -1154,7 +1154,7 @@ def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { let NumMicroOps = 2; let ResourceCycles = [2]; } -def: InstRW<[SKXWriteResGroup72], (instrs MMX_CVTPI2PSirr)>; +def: InstRW<[SKXWriteResGroup72], (instrs MMX_CVTPI2PSrr)>; def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPD(Z|Z128|Z256)rr", "VCOMPRESSPS(Z|Z128|Z256)rr", "VPCOMPRESSD(Z|Z128|Z256)rr", @@ -1166,26 +1166,26 @@ def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup73], (instrs MMX_PADDSBirm, - MMX_PADDSWirm, - MMX_PADDUSBirm, - MMX_PADDUSWirm, - MMX_PAVGBirm, - MMX_PAVGWirm, - MMX_PCMPEQBirm, - MMX_PCMPEQDirm, - MMX_PCMPEQWirm, - MMX_PCMPGTBirm, - MMX_PCMPGTDirm, - MMX_PCMPGTWirm, - MMX_PMAXSWirm, - MMX_PMAXUBirm, - MMX_PMINSWirm, - MMX_PMINUBirm, - MMX_PSUBSBirm, - MMX_PSUBSWirm, - MMX_PSUBUSBirm, - MMX_PSUBUSWirm)>; +def: InstRW<[SKXWriteResGroup73], (instrs MMX_PADDSBrm, + 
MMX_PADDSWrm, + MMX_PADDUSBrm, + MMX_PADDUSWrm, + MMX_PAVGBrm, + MMX_PAVGWrm, + MMX_PCMPEQBrm, + MMX_PCMPEQDrm, + MMX_PCMPEQWrm, + MMX_PCMPGTBrm, + MMX_PCMPGTDrm, + MMX_PCMPGTWrm, + MMX_PMAXSWrm, + MMX_PMAXUBrm, + MMX_PMINSWrm, + MMX_PMINUBrm, + MMX_PSUBSBrm, + MMX_PSUBSWrm, + MMX_PSUBUSBrm, + MMX_PSUBUSWrm)>; def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { let Latency = 6; @@ -1383,9 +1383,9 @@ def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> { let NumMicroOps = 3; let ResourceCycles = [2,1]; } -def: InstRW<[SKXWriteResGroup96], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def: InstRW<[SKXWriteResGroup96], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 7; @@ -1675,7 +1675,7 @@ def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup135], (instrs MMX_CVTPI2PSirm)>; +def: InstRW<[SKXWriteResGroup135], (instrs MMX_CVTPI2PSrm)>; def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 9; @@ -1701,19 +1701,7 @@ def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i", "VPMAXSQZ128rm(b?)", "VPMAXUQZ128rm(b?)", "VPMINSQZ128rm(b?)", - "VPMINUQZ128rm(b?)", - "VPMOVSXBDZ128rm(b?)", - "VPMOVSXBQZ128rm(b?)", - "VPMOVSXBWZ128rm(b?)", - "VPMOVSXDQZ128rm(b?)", - "VPMOVSXWDZ128rm(b?)", - "VPMOVSXWQZ128rm(b?)", - "VPMOVZXBDZ128rm(b?)", - "VPMOVZXBQZ128rm(b?)", - "VPMOVZXBWZ128rm(b?)", - "VPMOVZXDQZ128rm(b?)", - "VPMOVZXWDZ128rm(b?)", - "VPMOVZXWQZ128rm(b?)")>; + "VPMINUQZ128rm(b?)")>; def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 10; @@ -1745,7 +1733,7 @@ def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", +def: InstRW<[SKXWriteResGroup137], (instregex 
"MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { @@ -1942,8 +1930,8 @@ def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { def: InstRW<[SKXWriteResGroup166], (instrs CVTPD2PSrm, CVTPD2DQrm, CVTTPD2DQrm, - MMX_CVTPD2PIirm, - MMX_CVTTPD2PIirm)>; + MMX_CVTPD2PIrm, + MMX_CVTTPD2PIrm)>; def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 6fd98280f560..0fedfc01092c 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -320,30 +320,30 @@ defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; // Conversions. //////////////////////////////////////////////////////////////////////////////// -defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>; -defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; +defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [8,8], [9,9], 3, 4>; +defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtPS2IY>; defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; -defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [7,7], [6,6]>; -defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 8, 9, [8,8],[10,10], 3, 4>; +defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [7,7], [8,8], 4, 5>; defm : X86WriteResPairUnsupported<WriteCvtPD2IY>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort0,AtomPort1], 
[AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; +defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [6,6], 3, 1>; +defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtI2PSY>; defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; -defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 3>; +defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7], 3, 4>; defm : X86WriteResPairUnsupported<WriteCvtI2PDY>; defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; -defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [6,6], [7,7], 3, 4>; +defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7], 4, 5>; defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>; defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; -defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 6, 7, [5,5], [6,6]>; -defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 7, 8, [6,6], [7,7]>; +defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 10, 11,[10,10],[12,12], 3, 4>; +defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort0,AtomPort1], [AtomPort0,AtomPort1], 11, 12,[11,11],[12,12], 4, 5>; 
defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>; defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; @@ -525,8 +525,8 @@ def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> { let Latency = 5; let ResourceCycles = [5]; } -def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm, - MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>; +def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSrr, MMX_CVTPI2PSrm, + MMX_CVTPS2PIrr, MMX_CVTTPS2PIrr)>; // Port0 and Port1 def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> { @@ -547,9 +547,43 @@ def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> { let Latency = 5; let ResourceCycles = [5, 5]; } -def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>; +def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIrm, MMX_CVTTPS2PIrm)>; def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>; +def AtomWrite0_1_7 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 7; + let ResourceCycles = [6,6]; +} +def : InstRW<[AtomWrite0_1_7], (instregex "CVTSI642SDrm(_Int)?")>; + +def AtomWrite0_1_7_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 7; + let ResourceCycles = [8,8]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrr(_Int)?")>; + +def AtomWrite0_1_8_4 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 8; + let ResourceCycles = [8,8]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_7_4], (instregex "CVTSI642SSrm(_Int)?")>; + +def AtomWrite0_1_9 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 9; + let ResourceCycles = [9,9]; + let NumMicroOps = 4; +} +def : InstRW<[AtomWrite0_1_9], (instregex "CVT(T)?SS2SI64rr(_Int)?")>; + +def AtomWrite0_1_10 : SchedWriteRes<[AtomPort0,AtomPort1]> { + let Latency = 10; + let ResourceCycles = [11,11]; + let NumMicroOps = 5; +} +def : InstRW<[AtomWrite0_1_10], (instregex "CVT(T)?SS2SI64rm(_Int)?")>; + // Port0 or Port1 def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> { let Latency = 1; @@ -570,7 +604,7 @@ def : 
InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r, SCASB, SCASL, SCASQ, SCASW)>; def : InstRW<[AtomWrite01_2], (instregex "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)", "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)", - "MMX_P(ADD|SUB)Qirr", + "MMX_P(ADD|SUB)Qrr", "MOV(S|Z)X16rr8", "MOV(UPS|UPD|DQU)mr", "MASKMOVDQU(64)?", @@ -589,7 +623,7 @@ def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm", "XCHG(8|16|32|64)rm", "PH(ADD|SUB)Drr", "MOV(S|Z)X16rm8", - "MMX_P(ADD|SUB)Qirm", + "MMX_P(ADD|SUB)Qrm", "MOV(UPS|UPD|DQU)rm", "P(ADD|SUB)Qrm")>; @@ -647,15 +681,13 @@ def : InstRW<[AtomWrite01_9], (instrs POPA16, POPA32, SHLD64mri8, SHRD64mri8, SHLD64rri8, SHRD64rri8, CMPXCHG8rr)>; -def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F", - "CVT(T)?SS2SI64rr(_Int)?")>; +def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F")>; def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> { let Latency = 10; let ResourceCycles = [10]; } def : SchedAlias<WriteFLDC, AtomWrite01_10>; -def : InstRW<[AtomWrite01_10], (instregex "CVT(T)?SS2SI64rm(_Int)?")>; def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> { let Latency = 11; diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td index 4c16b5b52b1d..0f6f24f9f1fe 100644 --- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td @@ -1008,11 +1008,11 @@ defm : PdWriteResXMMPair<WriteCvtPD2I, [PdFPU0, PdFPCVT, PdFPSTO], 8, defm : PdWriteResYMMPair<WriteCvtPD2IY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; -def PdWriteMMX_CVTTPD2PIirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTTPD2PIrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTTPD2PIirr], (instrs MMX_CVTTPD2PIirr)>; +def : InstRW<[PdWriteMMX_CVTTPD2PIrr], (instrs MMX_CVTTPD2PIrr)>; // FIXME: f+3 ST, LD+STC latency defm : PdWriteResXMMPair<WriteCvtI2SS, [PdFPU0, 
PdFPCVT, PdFPSTO], 4, [], 2>; @@ -1048,18 +1048,18 @@ defm : PdWriteResXMMPair<WriteCvtPD2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, defm : PdWriteResYMMPair<WriteCvtPD2PSY, [PdFPU0, PdFPCVT, PdFPSTO, PdFPFMA], 8, [1, 2, 1, 1], 4>; defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; -def PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 6; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTPD2PIirrMMX_CVTPI2PDirr], (instrs MMX_CVTPD2PIirr, - MMX_CVTPI2PDirr)>; +def : InstRW<[PdWriteMMX_CVTPD2PIrrMMX_CVTPI2PDrr], (instrs MMX_CVTPD2PIrr, + MMX_CVTPI2PDrr)>; -def PdWriteMMX_CVTPI2PSirr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { +def PdWriteMMX_CVTPI2PSrr : SchedWriteRes<[PdFPU0, PdFPCVT, PdFPSTO]> { let Latency = 4; let NumMicroOps = 2; } -def : InstRW<[PdWriteMMX_CVTPI2PSirr], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[PdWriteMMX_CVTPI2PSrr], (instrs MMX_CVTPI2PSrr)>; defm : PdWriteResXMMPair<WriteCvtPH2PS, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 2, 1>; defm : PdWriteResYMMPair<WriteCvtPH2PSY, [PdFPU0, PdFPCVT, PdFPSTO], 8, [1, 2, 1], 4, 3>; @@ -1365,7 +1365,7 @@ def PdWriteVZeroIdiomLogic : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]> ]>; -def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; +def : InstRW<[PdWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>; def PdWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, @@ -1378,11 +1378,11 @@ def PdWriteVZeroIdiomALU : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]> ]>; -def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, - MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PCMPGTBirr, - MMX_PCMPGTDirr, - 
MMX_PCMPGTWirr)>; +def : InstRW<[PdWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr, + MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PCMPGTBrr, + MMX_PCMPGTDrr, + MMX_PCMPGTWrr)>; def PdWriteVZeroIdiomALUX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [PdWriteZeroLatency]>, @@ -1408,10 +1408,10 @@ def : IsZeroIdiomFunction<[ // MMX Zero-idioms. DepBreakingClass<[ - MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr, - MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr ], ZeroIdiomPredicate>, // SSE Zero-idioms. @@ -1449,7 +1449,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr + MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 68ebaa244acf..a070da34cab5 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -888,7 +888,7 @@ def JWriteVZeroIdiomLogic : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, SchedVar<NoSchedPred, [WriteVecLogic]> ]>; -def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; +def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORrr, MMX_PANDNrr)>; def JWriteVZeroIdiomLogicX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, @@ -901,12 +901,12 @@ def JWriteVZeroIdiomALU : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, SchedVar<NoSchedPred, [WriteVecALU]> ]>; -def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, - MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, 
MMX_PSUBSWirr, - MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, - MMX_PCMPGTWirr)>; +def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBrr, MMX_PSUBDrr, + MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, + MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, + MMX_PCMPGTWrr)>; def JWriteVZeroIdiomALUX : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, @@ -974,10 +974,10 @@ def : IsZeroIdiomFunction<[ // MMX Zero-idioms. DepBreakingClass<[ - MMX_PXORirr, MMX_PANDNirr, MMX_PSUBBirr, - MMX_PSUBDirr, MMX_PSUBQirr, MMX_PSUBWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr, - MMX_PCMPGTBirr, MMX_PCMPGTDirr, MMX_PCMPGTWirr + MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr, + MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr, + MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr ], ZeroIdiomPredicate>, // SSE Zero-idioms. @@ -1017,7 +1017,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQDirr, MMX_PCMPEQWirr + MMX_PCMPEQBrr, MMX_PCMPEQDrr, MMX_PCMPEQWrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 5af9835f75a7..36e5b55a4194 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -467,8 +467,8 @@ def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> { let NumMicroOps = 2; let ResourceCycles = [8]; } -def: InstRW<[SLMWriteResGroup1rr], (instrs MMX_PADDQirr, PADDQrr, - MMX_PSUBQirr, PSUBQrr, +def: InstRW<[SLMWriteResGroup1rr], (instrs MMX_PADDQrr, PADDQrr, + MMX_PSUBQrr, PSUBQrr, PCMPEQQrr)>; def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> { @@ -476,8 +476,8 @@ def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> { let NumMicroOps = 3; let ResourceCycles = [1,8]; } -def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQirm, PADDQrm, - MMX_PSUBQirm, 
PSUBQrm, +def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQrm, PADDQrm, + MMX_PSUBQrm, PSUBQrm, PCMPEQQrm)>; } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 8e30e5e10ca8..4343e1ed45d1 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -1000,12 +1000,12 @@ def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> { let NumMicroOps = 2; } -def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; -def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; +def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ; def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> { @@ -1305,15 +1305,15 @@ def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> { } // CVT(T)PS2PI. // mm,x. -def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; +def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIrr")>; // CVTPI2PD. // x,mm. -def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; +def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDrr)>; // CVT(T)PD2PI. // mm,x. 
-def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; +def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIrr")>; def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> { let Latency = 5; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td index a83c89e2f28a..96d2837880c7 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver2.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td @@ -1012,12 +1012,12 @@ def Zn2WriteFPU12Ym : SchedWriteRes<[Zn2AGU, Zn2FPU12]> { let NumMicroOps = 2; } -def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWirr, - MMX_PACKSSWBirr, - MMX_PACKUSWBirr)>; -def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWirm, - MMX_PACKSSWBirm, - MMX_PACKUSWBirm)>; +def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWrr, + MMX_PACKSSWBrr, + MMX_PACKUSWBrr)>; +def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWrm, + MMX_PACKSSWBrm, + MMX_PACKUSWBrm)>; def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ; def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ; @@ -1304,15 +1304,15 @@ def Zn2WriteCVTPS2PIr: SchedWriteRes<[Zn2FPU3]> { } // CVT(T)PS2PI. // mm,x. -def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; +def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIrr")>; // CVTPI2PD. // x,mm. -def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; +def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDrr)>; // CVT(T)PD2PI. // mm,x. 
-def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; +def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIrr")>; def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> { let Latency = 3; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index be07c069aae1..f4e03ac11f0b 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -1075,9 +1075,9 @@ def Zn3WriteVecALUXMMX : SchedWriteRes<[Zn3FPVAdd01]> { } def : InstRW<[Zn3WriteVecALUXMMX], (instrs MMX_PABSBrr, MMX_PABSDrr, MMX_PABSWrr, MMX_PSIGNBrr, MMX_PSIGNDrr, MMX_PSIGNWrr, - MMX_PADDSBirr, MMX_PADDSWirr, MMX_PADDUSBirr, MMX_PADDUSWirr, - MMX_PAVGBirr, MMX_PAVGWirr, - MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr)>; + MMX_PADDSBrr, MMX_PADDSWrr, MMX_PADDUSBrr, MMX_PADDUSWrr, + MMX_PAVGBrr, MMX_PAVGWrr, + MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr)>; defm : Zn3WriteResYMMPair<WriteVecALUY, [Zn3FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM). @@ -1161,7 +1161,7 @@ def Zn3WriteCvtPD2IMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [2]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIirm, MMX_CVTTPD2PIirm, MMX_CVTPD2PIirr, MMX_CVTTPD2PIirr)>; +def : InstRW<[Zn3WriteCvtPD2IMMX], (instrs MMX_CVTPD2PIrm, MMX_CVTTPD2PIrm, MMX_CVTPD2PIrr, MMX_CVTTPD2PIrr)>; defm : Zn3WriteResXMMPair<WriteCvtSS2I, [Zn3FPFCvt01], 2, [2], 2>; // Float -> Integer. @@ -1179,7 +1179,7 @@ def Zn3WriteCvtI2PDMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [6]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDirm, MMX_CVTPI2PDirr)>; +def : InstRW<[Zn3WriteCvtI2PDMMX], (instrs MMX_CVTPI2PDrm, MMX_CVTPI2PDrr)>; defm : Zn3WriteResXMMPair<WriteCvtI2SS, [Zn3FPFCvt01], 3, [2], 2, /*LoadUOps=*/-1>; // Integer -> Float. defm : Zn3WriteResXMMPair<WriteCvtI2PS, [Zn3FPFCvt01], 3, [1], 1>; // Integer -> Float (XMM). 
@@ -1191,7 +1191,7 @@ def Zn3WriteCvtI2PSMMX : SchedWriteRes<[Zn3FPFCvt01]> { let ResourceCycles = [1]; let NumMicroOps = 2; } -def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSirr)>; +def : InstRW<[Zn3WriteCvtI2PSMMX], (instrs MMX_CVTPI2PSrr)>; defm : Zn3WriteResXMMPair<WriteCvtSS2SD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion. defm : Zn3WriteResXMMPair<WriteCvtPS2PD, [Zn3FPFCvt01], 3, [1], 1>; // Float -> Double size conversion (XMM). @@ -1621,7 +1621,7 @@ def : IsDepBreakingFunction<[ // MMX DepBreakingClass<[ - MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr + MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr ], ZeroIdiomPredicate>, // SSE diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 9da54dc2e9b7..5d773f0c57df 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -958,8 +958,7 @@ public: // extended frames should be flagged as present. const Triple &TT = getTargetTriple(); - unsigned Major, Minor, Micro; - TT.getOSVersion(Major, Minor, Micro); + unsigned Major = TT.getOSVersion().getMajor(); switch(TT.getOS()) { default: return false; diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 336985f3bf9d..78bc5519c23f 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -588,6 +588,18 @@ void X86PassConfig::addPreEmitPass2() { // Insert pseudo probe annotation for callsite profiling addPass(createPseudoProbeInserter()); + + // On Darwin platforms, BLR_RVMARKER pseudo instructions are lowered to + // bundles. + if (TT.isOSDarwin()) + addPass(createUnpackMachineBundles([](const MachineFunction &MF) { + // Only run bundle expansion if there are relevant ObjC runtime functions + // present in the module. 
+ const Function &F = MF.getFunction(); + const Module *M = F.getParent(); + return M->getFunction("objc_retainAutoreleasedReturnValue") || + M->getFunction("objc_unsafeClaimAutoreleasedReturnValue"); + })); } bool X86PassConfig::addPostFastRegAllocRewrite() { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 869762b35196..d8cd7311a0d5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -236,47 +236,50 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( } } - if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM || - ISD == ISD::UDIV || ISD == ISD::UREM) && + // Vector multiply by pow2 will be simplified to shifts. + if (ISD == ISD::MUL && (Op2Info == TargetTransformInfo::OK_UniformConstantValue || Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && - Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { - // Vector multiply by pow2 will be simplified to shifts. - if (ISD == ISD::MUL) { - InstructionCost Cost = getArithmeticInstrCost( - Instruction::Shl, Ty, CostKind, Op1Info, Op2Info, - TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); - return Cost; - } - - if (ISD == ISD::SDIV || ISD == ISD::SREM) { - // On X86, vector signed division by constants power-of-two are - // normally expanded to the sequence SRA + SRL + ADD + SRA. - // The OperandValue properties may not be the same as that of the previous - // operation; conservatively assume OP_None. 
- InstructionCost Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); - Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, - Op2Info, TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) + return getArithmeticInstrCost(Instruction::Shl, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); - if (ISD == ISD::SREM) { - // For SREM: (X % C) is the equivalent of (X - (X/C)*C) - Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, - Op2Info); - Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, - Op2Info); - } + // On X86, vector signed division by constants power-of-two are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties may not be the same as that of the previous + // operation; conservatively assume OP_None. 
+ if ((ISD == ISD::SDIV || ISD == ISD::SREM) && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + InstructionCost Cost = + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); - return Cost; + if (ISD == ISD::SREM) { + // For SREM: (X % C) is the equivalent of (X - (X/C)*C) + Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info, + Op2Info); + Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info, + Op2Info); } - // Vector unsigned division/remainder will be simplified to shifts/masks. + return Cost; + } + + // Vector unsigned division/remainder will be simplified to shifts/masks. 
+ if ((ISD == ISD::UDIV || ISD == ISD::UREM) && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { if (ISD == ISD::UDIV) return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info, Op2Info, TargetTransformInfo::OP_None, @@ -660,6 +663,7 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org) { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org) { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add + { ISD::MUL, MVT::i64, 1 }, // Skylake from http://www.agner.org/ { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/ @@ -5188,10 +5192,10 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (RealCallerBits & RealCalleeBits) == RealCalleeBits; } -bool X86TTIImpl::areFunctionArgsABICompatible( - const Function *Caller, const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const { - if (!BaseT::areFunctionArgsABICompatible(Caller, Callee, Args)) +bool X86TTIImpl::areTypesABICompatible(const Function *Caller, + const Function *Callee, + const ArrayRef<Type *> &Types) const { + if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) return false; // If we get here, we know the target features match. If one function @@ -5206,13 +5210,8 @@ bool X86TTIImpl::areFunctionArgsABICompatible( // Consider the arguments compatible if they aren't vectors or aggregates. // FIXME: Look at the size of vectors. // FIXME: Look at the element types of aggregates to see if there are vectors. - // FIXME: The API of this function seems intended to allow arguments - // to be removed from the set, but the caller doesn't check if the set - // becomes empty so that may not work in practice. 
- return llvm::none_of(Args, [](Argument *A) { - auto *EltTy = cast<PointerType>(A->getType())->getElementType(); - return EltTy->isVectorTy() || EltTy->isAggregateType(); - }); + return llvm::none_of(Types, + [](Type *T) { return T->isVectorTy() || T->isAggregateType(); }); } X86TTIImpl::TTI::MemCmpExpansionOptions diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index c53424ec0026..11e9cb09c7d5 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -234,9 +234,8 @@ public: bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - bool areFunctionArgsABICompatible(const Function *Caller, - const Function *Callee, - SmallPtrSetImpl<Argument *> &Args) const; + bool areTypesABICompatible(const Function *Caller, const Function *Callee, + const ArrayRef<Type *> &Type) const; TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const; bool prefersVectorizedAddressing() const; diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index abac3f801a22..4624b735bef8 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -475,12 +475,12 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) { // any of its operands, this way, when we get to the operand, we already // removed the instructions (from the expression dag) that uses it. 
CurrentTruncInst->eraseFromParent(); - for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) { + for (auto &I : llvm::reverse(InstInfoMap)) { // We still need to check that the instruction has no users before we erase // it, because {SExt, ZExt}Inst Instruction might have other users that was // not reduced, in such case, we need to keep that instruction. - if (I->first->use_empty()) - I->first->eraseFromParent(); + if (I.first->use_empty()) + I.first->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp index 96c083a144b2..5fc5295969d0 100644 --- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp +++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -165,6 +165,12 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { IRBuilder<> B(CB); Value *CalledOperand = CB->getCalledOperand(); + // If the indirect call is called within catchpad or cleanuppad, + // we need to copy "funclet" bundle of the call. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet)) + Bundles.push_back(OperandBundleDef(*Bundle)); + // Load the global symbol as a pointer to the check function. LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); @@ -172,7 +178,7 @@ void CFGuard::insertCFGuardCheck(CallBase *CB) { // even if the original CallBase is an Invoke or CallBr instruction. CallInst *GuardCheck = B.CreateCall(GuardFnType, GuardCheckLoad, - {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}, Bundles); // Ensure that the first argument is passed in the correct register // (e.g. ECX on 32-bit X86 targets). 
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index ac3d078714ce..a0d12865bd3a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1237,8 +1237,10 @@ namespace { struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { using Base = PtrUseVisitor<AllocaUseVisitor>; AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, - const CoroBeginInst &CB, const SuspendCrossingInfo &Checker) - : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker) {} + const CoroBeginInst &CB, const SuspendCrossingInfo &Checker, + bool ShouldUseLifetimeStartInfo) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker), + ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {} void visit(Instruction &I) { Users.insert(&I); @@ -1390,6 +1392,7 @@ private: SmallPtrSet<Instruction *, 4> Users{}; SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{}; bool MayWriteBeforeCoroBegin{false}; + bool ShouldUseLifetimeStartInfo{true}; mutable llvm::Optional<bool> ShouldLiveOnFrame{}; @@ -1398,7 +1401,7 @@ private: // more precise. We look at every pair of lifetime.start intrinsic and // every basic block that uses the pointer to see if they cross suspension // points. The uses cover both direct uses as well as indirect uses. - if (!LifetimeStarts.empty()) { + if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) { for (auto *I : Users) for (auto *S : LifetimeStarts) if (Checker.isDefinitionAcrossSuspend(*S, I)) @@ -2484,8 +2487,15 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape, continue; } DominatorTree DT(F); + // The code that uses lifetime.start intrinsic does not work for functions + // with loops without exit. Disable it on ABIs we know to generate such + // code. 
+ bool ShouldUseLifetimeStartInfo = + (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce); AllocaUseVisitor Visitor{F.getParent()->getDataLayout(), DT, - *Shape.CoroBegin, Checker}; + *Shape.CoroBegin, Checker, + ShouldUseLifetimeStartInfo}; Visitor.visitPtr(*AI); if (!Visitor.getShouldLiveOnFrame()) continue; @@ -2572,9 +2582,15 @@ void coro::salvageDebugInfo( DVI->setExpression(Expr); /// It makes no sense to move the dbg.value intrinsic. if (!isa<DbgValueInst>(DVI)) { - if (auto *InsertPt = dyn_cast<Instruction>(Storage)) + if (auto *II = dyn_cast<InvokeInst>(Storage)) + DVI->moveBefore(II->getNormalDest()->getFirstNonPHI()); + else if (auto *CBI = dyn_cast<CallBrInst>(Storage)) + DVI->moveBefore(CBI->getDefaultDest()->getFirstNonPHI()); + else if (auto *InsertPt = dyn_cast<Instruction>(Storage)) { + assert(!InsertPt->isTerminator() && + "Unimaged terminator that could return a storage."); DVI->moveAfter(InsertPt); - else if (isa<Argument>(Storage)) + } else if (isa<Argument>(Storage)) DVI->moveAfter(F->getEntryBlock().getFirstNonPHI()); } } @@ -2664,7 +2680,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } - sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon && + Shape.ABI != coro::ABI::RetconOnce) + sinkLifetimeStartMarkers(F, Shape, Checker); + if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty()) collectFrameAllocas(F, Shape, Checker, FrameData.Allocas); LLVM_DEBUG(dumpAllocas(FrameData.Allocas)); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index fa1d92f439b8..12c1829524ef 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -280,6 +280,27 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, BB->getTerminator()->eraseFromParent(); } +// Mark a coroutine as done, which implies that the 
coroutine is finished and +// never get resumed. +// +// In resume-switched ABI, the done state is represented by storing zero in +// ResumeFnAddr. +// +// NOTE: We couldn't omit the argument `FramePtr`. It is necessary because the +// pointer to the frame in splitted function is not stored in `Shape`. +static void markCoroutineAsDone(IRBuilder<> &Builder, const coro::Shape &Shape, + Value *FramePtr) { + assert( + Shape.ABI == coro::ABI::Switch && + "markCoroutineAsDone is only supported for Switch-Resumed ABI for now."); + auto *GepIndex = Builder.CreateStructGEP( + Shape.FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( + Shape.FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); + Builder.CreateStore(NullPtr, GepIndex); +} + /// Replace an unwind call to llvm.coro.end. static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, bool InResume, @@ -288,10 +309,18 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, switch (Shape.ABI) { // In switch-lowering, this does nothing in the main function. - case coro::ABI::Switch: + case coro::ABI::Switch: { + // In C++'s specification, the coroutine should be marked as done + // if promise.unhandled_exception() throws. The frontend will + // call coro.end(true) along this path. + // + // FIXME: We should refactor this once there is other language + // which uses Switch-Resumed style other than C++. + markCoroutineAsDone(Builder, Shape, FramePtr); if (!InResume) return; break; + } // In async lowering this does nothing. case coro::ABI::Async: break; @@ -364,13 +393,9 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { auto *Save = S->getCoroSave(); Builder.SetInsertPoint(Save); if (S->isFinal()) { - // Final suspend point is represented by storing zero in ResumeFnAddr. 
- auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, - coro::Shape::SwitchFieldIndex::Resume, - "ResumeFn.addr"); - auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( - FrameTy->getTypeAtIndex(coro::Shape::SwitchFieldIndex::Resume))); - Builder.CreateStore(NullPtr, GepIndex); + // The coroutine should be marked done if it reaches the final suspend + // point. + markCoroutineAsDone(Builder, Shape, FramePtr); } else { auto *GepIndex = Builder.CreateStructGEP( FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr"); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index e4883ef89db7..fba8b03e44ba 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -141,7 +141,6 @@ static bool isCoroutineIntrinsicName(StringRef Name) { "llvm.coro.id.retcon", "llvm.coro.id.retcon.once", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.prepare.async", "llvm.coro.prepare.retcon", "llvm.coro.promise", diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 93bb11433775..3a42a2cac928 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -835,14 +835,20 @@ bool ArgumentPromotionPass::areFunctionArgsABICompatible( const Function &F, const TargetTransformInfo &TTI, SmallPtrSetImpl<Argument *> &ArgsToPromote, SmallPtrSetImpl<Argument *> &ByValArgsToTransform) { + // TODO: Check individual arguments so we can promote a subset? 
+ SmallVector<Type *, 32> Types; + for (Argument *Arg : ArgsToPromote) + Types.push_back(Arg->getType()->getPointerElementType()); + for (Argument *Arg : ByValArgsToTransform) + Types.push_back(Arg->getParamByValType()); + for (const Use &U : F.uses()) { CallBase *CB = dyn_cast<CallBase>(U.getUser()); if (!CB) return false; const Function *Caller = CB->getCaller(); const Function *Callee = CB->getCalledFunction(); - if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) || - !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform)) + if (!TTI.areTypesABICompatible(Caller, Callee, Types)) return false; } return true; diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index edadc79e3a9f..7e729e57153c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2139,12 +2139,10 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) { bool Result = true; #ifndef NDEBUG if (SeedAllowList.size() != 0) - Result = - std::count(SeedAllowList.begin(), SeedAllowList.end(), AA.getName()); + Result = llvm::is_contained(SeedAllowList, AA.getName()); Function *Fn = AA.getAnchorScope(); if (FunctionSeedAllowList.size() != 0 && Fn) - Result &= std::count(FunctionSeedAllowList.begin(), - FunctionSeedAllowList.end(), Fn->getName()); + Result &= llvm::is_contained(FunctionSeedAllowList, Fn->getName()); #endif return Result; } diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index ec08287393de..b977821bcaa6 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -417,7 +417,7 @@ const Value *stripAndAccumulateMinimalOffsets( AttributorAnalysis); } -static const Value *getMinimalBaseOfAccsesPointerOperand( +static const Value *getMinimalBaseOfAccessPointerOperand( Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I, int64_t 
&BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) { const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false); @@ -2129,7 +2129,7 @@ static int64_t getKnownNonNullAndDerefBytesForUse( int64_t Offset; const Value *Base = - getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL); + getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL); if (Base) { if (Base == &AssociatedValue && getPointerOperand(I, /* AllowVolatile */ false) == UseV) { @@ -6414,31 +6414,36 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return indicatePessimisticFixpoint(); } + // Collect the types that will replace the privatizable type in the function + // signature. + SmallVector<Type *, 16> ReplacementTypes; + identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); + // Verify callee and caller agree on how the promoted argument would be // passed. - // TODO: The use of the ArgumentPromotion interface here is ugly, we need a - // specialized form of TargetTransformInfo::areFunctionArgsABICompatible - // which doesn't require the arguments ArgumentPromotion wanted to pass. 
Function &Fn = *getIRPosition().getAnchorScope(); - SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy; - ArgsToPromote.insert(getAssociatedArgument()); const auto *TTI = A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn); - if (!TTI || - !ArgumentPromotionPass::areFunctionArgsABICompatible( - Fn, *TTI, ArgsToPromote, Dummy) || - ArgsToPromote.empty()) { + if (!TTI) { + LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Missing TTI for function " + << Fn.getName() << "\n"); + return indicatePessimisticFixpoint(); + } + + auto CallSiteCheck = [&](AbstractCallSite ACS) { + CallBase *CB = ACS.getInstruction(); + return TTI->areTypesABICompatible( + CB->getCaller(), CB->getCalledFunction(), ReplacementTypes); + }; + bool AllCallSitesKnown; + if (!A.checkForAllCallSites(CallSiteCheck, *this, true, + AllCallSitesKnown)) { LLVM_DEBUG( dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for " << Fn.getName() << "\n"); return indicatePessimisticFixpoint(); } - // Collect the types that will replace the privatizable type in the function - // signature. - SmallVector<Type *, 16> ReplacementTypes; - identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes); - // Register a rewrite of the argument. 
Argument *Arg = getAssociatedArgument(); if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) { @@ -6558,7 +6563,6 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { return false; }; - bool AllCallSitesKnown; if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true, AllCallSitesKnown)) return indicatePessimisticFixpoint(); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index cde78713b554..321d4a19a585 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -76,6 +76,7 @@ STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); +STATISTIC(NumWriteOnlyArg, "Number of arguments marked writeonly"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull"); STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); @@ -580,16 +581,8 @@ struct ArgumentUsesTracker : public CaptureTracker { return true; } - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account for - // these. 
- - unsigned UseIndex = - std::distance(const_cast<const Use *>(CB->arg_begin()), U); - - assert(UseIndex < CB->data_operands_size() && - "Indirect function calls should have been filtered above!"); - + assert(!CB->isCallee(U) && "callee operand reported captured?"); + const unsigned UseIndex = CB->getDataOperandNo(U); if (UseIndex >= CB->arg_size()) { // Data operand, but not a argument operand -- must be a bundle operand assert(CB->hasOperandBundles() && "Must be!"); @@ -649,8 +642,8 @@ struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> { /// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone. static Attribute::AttrKind -determinePointerReadAttrs(Argument *A, - const SmallPtrSet<Argument *, 8> &SCCNodes) { +determinePointerAccessAttrs(Argument *A, + const SmallPtrSet<Argument *, 8> &SCCNodes) { SmallVector<Use *, 32> Worklist; SmallPtrSet<Use *, 32> Visited; @@ -659,7 +652,7 @@ determinePointerReadAttrs(Argument *A, return Attribute::None; bool IsRead = false; - // We don't need to track IsWritten. If A is written to, return immediately. + bool IsWrite = false; for (Use &U : A->uses()) { Visited.insert(&U); @@ -667,6 +660,10 @@ determinePointerReadAttrs(Argument *A, } while (!Worklist.empty()) { + if (IsWrite && IsRead) + // No point in searching further.. + return Attribute::None; + Use *U = Worklist.pop_back_val(); Instruction *I = cast<Instruction>(U->getUser()); @@ -684,73 +681,49 @@ determinePointerReadAttrs(Argument *A, case Instruction::Call: case Instruction::Invoke: { - bool Captures = true; + CallBase &CB = cast<CallBase>(*I); + if (CB.isCallee(U)) { + IsRead = true; + // Note that indirect calls do not capture, see comment in + // CaptureTracking for context + continue; + } - if (I->getType()->isVoidTy()) - Captures = false; + // Given we've explictily handled the callee operand above, what's left + // must be a data operand (e.g. 
argument or operand bundle) + const unsigned UseIndex = CB.getDataOperandNo(U); - auto AddUsersToWorklistIfCapturing = [&] { - if (Captures) + if (!CB.doesNotCapture(UseIndex)) { + if (!CB.onlyReadsMemory()) + // If the callee can save a copy into other memory, then simply + // scanning uses of the call is insufficient. We have no way + // of tracking copies of the pointer through memory to see + // if a reloaded copy is written to, thus we must give up. + return Attribute::None; + // Push users for processing once we finish this one + if (!I->getType()->isVoidTy()) for (Use &UU : I->uses()) if (Visited.insert(&UU).second) Worklist.push_back(&UU); - }; - - CallBase &CB = cast<CallBase>(*I); - if (CB.doesNotAccessMemory()) { - AddUsersToWorklistIfCapturing(); - continue; } + + if (CB.doesNotAccessMemory()) + continue; - Function *F = CB.getCalledFunction(); - if (!F) { - if (CB.onlyReadsMemory()) { - IsRead = true; - AddUsersToWorklistIfCapturing(); - continue; - } - return Attribute::None; - } - - // Note: the callee and the two successor blocks *follow* the argument - // operands. This means there is no need to adjust UseIndex to account - // for these. - - unsigned UseIndex = std::distance(CB.arg_begin(), U); - - // U cannot be the callee operand use: since we're exploring the - // transitive uses of an Argument, having such a use be a callee would - // imply the call site is an indirect call or invoke; and we'd take the - // early exit above. - assert(UseIndex < CB.data_operands_size() && - "Data operand use expected!"); - - bool IsOperandBundleUse = UseIndex >= CB.arg_size(); + if (Function *F = CB.getCalledFunction()) + if (CB.isArgOperand(U) && UseIndex < F->arg_size() && + SCCNodes.count(F->getArg(UseIndex))) + // This is an argument which is part of the speculative SCC. Note + // that only operands corresponding to formal arguments of the callee + // can participate in the speculation. 
+ break; - if (UseIndex >= F->arg_size() && !IsOperandBundleUse) { - assert(F->isVarArg() && "More params than args in non-varargs call"); + // The accessors used on call site here do the right thing for calls and + // invokes with operand bundles. + if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) return Attribute::None; - } - - Captures &= !CB.doesNotCapture(UseIndex); - - // Since the optimizer (by design) cannot see the data flow corresponding - // to a operand bundle use, these cannot participate in the optimistic SCC - // analysis. Instead, we model the operand bundle uses as arguments in - // call to a function external to the SCC. - if (IsOperandBundleUse || - !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) { - - // The accessors used on call site here do the right thing for calls and - // invokes with operand bundles. - - if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex)) - return Attribute::None; - if (!CB.doesNotAccessMemory(UseIndex)) - IsRead = true; - } - - AddUsersToWorklistIfCapturing(); + if (!CB.doesNotAccessMemory(UseIndex)) + IsRead = true; break; } @@ -763,6 +736,19 @@ determinePointerReadAttrs(Argument *A, IsRead = true; break; + case Instruction::Store: + if (cast<StoreInst>(I)->getValueOperand() == *U) + // untrackable capture + return Attribute::None; + + // A volatile store has side effects beyond what writeonly can be relied + // upon. + if (cast<StoreInst>(I)->isVolatile()) + return Attribute::None; + + IsWrite = true; + break; + case Instruction::ICmp: case Instruction::Ret: break; @@ -772,7 +758,14 @@ determinePointerReadAttrs(Argument *A, } } - return IsRead ? Attribute::ReadOnly : Attribute::ReadNone; + if (IsWrite && IsRead) + return Attribute::None; + else if (IsRead) + return Attribute::ReadOnly; + else if (IsWrite) + return Attribute::WriteOnly; + else + return Attribute::ReadNone; } /// Deduce returned attributes for the SCC. 
@@ -865,9 +858,10 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } -static bool addReadAttr(Argument *A, Attribute::AttrKind R) { - assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) - && "Must be a Read attribute."); +static bool addAccessAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone || + R == Attribute::WriteOnly) + && "Must be an access attribute."); assert(A && "Argument must not be null."); // If the argument already has the attribute, nothing needs to be done. @@ -880,7 +874,12 @@ static bool addReadAttr(Argument *A, Attribute::AttrKind R) { A->removeAttr(Attribute::ReadOnly); A->removeAttr(Attribute::ReadNone); A->addAttr(R); - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + if (R == Attribute::ReadOnly) + ++NumReadOnlyArg; + else if (R == Attribute::WriteOnly) + ++NumWriteOnlyArg; + else + ++NumReadNoneArg; return true; } @@ -945,15 +944,15 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Otherwise, it's captured. Don't bother doing SCC analysis on it. } if (!HasNonLocalUses && !A->onlyReadsMemory()) { - // Can we determine that it's readonly/readnone without doing an SCC? - // Note that we don't allow any calls at all here, or else our result - // will be dependent on the iteration order through the functions in the - // SCC. + // Can we determine that it's readonly/readnone/writeonly without doing + // an SCC? Note that we don't allow any calls at all here, or else our + // result will be dependent on the iteration order through the + // functions in the SCC. 
SmallPtrSet<Argument *, 8> Self; Self.insert(&*A); - Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); if (R != Attribute::None) - if (addReadAttr(A, R)) + if (addAccessAttr(A, R)) Changed.insert(F); } } @@ -979,6 +978,13 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, A->addAttr(Attribute::NoCapture); ++NumNoCapture; Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet<Argument *, 8> Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1023,10 +1029,10 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, Changed.insert(A->getParent()); } - // We also want to compute readonly/readnone. With a small number of false - // negatives, we can assume that any pointer which is captured isn't going - // to be provably readonly or readnone, since by definition we can't - // analyze all uses of a captured pointer. + // We also want to compute readonly/readnone/writeonly. With a small number + // of false negatives, we can assume that any pointer which is captured + // isn't going to be provably readonly or readnone, since by definition + // we can't analyze all uses of a captured pointer. // // The false negatives happen when the pointer is captured by a function // that promises readonly/readnone behaviour on the pointer, then the @@ -1034,24 +1040,28 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // Also, a readonly/readnone pointer may be returned, but returning a // pointer is capturing it. 
- Attribute::AttrKind ReadAttr = Attribute::ReadNone; - for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { + auto meetAccessAttr = [](Attribute::AttrKind A, Attribute::AttrKind B) { + if (A == B) + return A; + if (A == Attribute::ReadNone) + return B; + if (B == Attribute::ReadNone) + return A; + return Attribute::None; + }; + + Attribute::AttrKind AccessAttr = Attribute::ReadNone; + for (unsigned i = 0, e = ArgumentSCC.size(); + i != e && AccessAttr != Attribute::None; ++i) { Argument *A = ArgumentSCC[i]->Definition; - Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes); - if (K == Attribute::ReadNone) - continue; - if (K == Attribute::ReadOnly) { - ReadAttr = Attribute::ReadOnly; - continue; - } - ReadAttr = K; - break; + Attribute::AttrKind K = determinePointerAccessAttrs(A, ArgumentSCCNodes); + AccessAttr = meetAccessAttr(AccessAttr, K); } - if (ReadAttr != Attribute::None) { + if (AccessAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - if (addReadAttr(A, ReadAttr)) + if (addAccessAttr(A, AccessAttr)) Changed.insert(A->getParent()); } } diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index fbd083bb9bbf..2425646455bd 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -64,8 +64,8 @@ static cl::opt<unsigned> FuncSpecializationMaxIters( cl::desc("The maximum number of iterations function specialization is run"), cl::init(1)); -static cl::opt<unsigned> MaxConstantsThreshold( - "func-specialization-max-constants", cl::Hidden, +static cl::opt<unsigned> MaxClonesThreshold( + "func-specialization-max-clones", cl::Hidden, cl::desc("The maximum number of clones allowed for a single function " "specialization"), cl::init(3)); @@ -92,6 +92,28 @@ static cl::opt<bool> EnableSpecializationForLiteralConstant( cl::desc("Enable 
specialization of functions that take a literal constant " "as an argument.")); +namespace { +// Bookkeeping struct to pass data from the analysis and profitability phase +// to the actual transform helper functions. +struct ArgInfo { + Function *Fn; // The function to perform specialisation on. + Argument *Arg; // The Formal argument being analysed. + Constant *Const; // A corresponding actual constant argument. + InstructionCost Gain; // Profitability: Gain = Bonus - Cost. + + // Flag if this will be a partial specialization, in which case we will need + // to keep the original function around in addition to the added + // specializations. + bool Partial = false; + + ArgInfo(Function *F, Argument *A, Constant *C, InstructionCost G) + : Fn(F), Arg(A), Const(C), Gain(G){}; +}; +} // Anonymous namespace + +using FuncList = SmallVectorImpl<Function *>; +using ConstList = SmallVectorImpl<Constant *>; + // Helper to check if \p LV is either a constant or a constant // range with a single element. This should cover exactly the same cases as the // old ValueLatticeElement::isConstant() and is intended to be used in the @@ -169,7 +191,7 @@ static Constant *getConstantStackValue(CallInst *Call, Value *Val, // ret void // } // -static void constantArgPropagation(SmallVectorImpl<Function *> &WorkList, +static void constantArgPropagation(FuncList &WorkList, Module &M, SCCPSolver &Solver) { // Iterate over the argument tracked functions see if there // are any new constant values for the call instruction via @@ -254,40 +276,33 @@ public: /// /// \returns true if at least one function is specialized. bool - specializeFunctions(SmallVectorImpl<Function *> &FuncDecls, - SmallVectorImpl<Function *> &CurrentSpecializations) { - - // Attempt to specialize the argument-tracked functions. 
+ specializeFunctions(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { bool Changed = false; for (auto *F : FuncDecls) { - if (specializeFunction(F, CurrentSpecializations)) { - Changed = true; - LLVM_DEBUG(dbgs() << "FnSpecialization: Can specialize this func.\n"); - } else { + if (!isCandidateFunction(F, CurrentSpecializations)) + continue; + + auto Cost = getSpecializationCost(F); + if (!Cost.isValid()) { LLVM_DEBUG( - dbgs() << "FnSpecialization: Cannot specialize this func.\n"); + dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); + continue; } - } - for (auto *SpecializedFunc : CurrentSpecializations) { - SpecializedFuncs.insert(SpecializedFunc); - - // Initialize the state of the newly created functions, marking them - // argument-tracked and executable. - if (SpecializedFunc->hasExactDefinition() && - !SpecializedFunc->hasFnAttribute(Attribute::Naked)) - Solver.addTrackedFunction(SpecializedFunc); - Solver.addArgumentTrackedFunction(SpecializedFunc); - FuncDecls.push_back(SpecializedFunc); - Solver.markBlockExecutable(&SpecializedFunc->front()); + auto ConstArgs = calculateGains(F, Cost); + if (ConstArgs.empty()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); + continue; + } - // Replace the function arguments for the specialized functions. - for (Argument &Arg : SpecializedFunc->args()) - if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) - LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " - << Arg.getName() << "\n"); + for (auto &CA : ConstArgs) { + specializeFunction(CA, CurrentSpecializations); + Changed = true; + } } + updateSpecializedFuncs(FuncDecls, CurrentSpecializations); NumFuncSpecialized += NbFunctionsSpecialized; return Changed; } @@ -333,15 +348,83 @@ private: return Clone; } - /// This function decides whether to specialize function \p F based on the - /// known constant values its arguments can take on. 
Specialization is - /// performed on the first interesting argument. Specializations based on - /// additional arguments will be evaluated on following iterations of the - /// main IPSCCP solve loop. \returns true if the function is specialized and - /// false otherwise. - bool specializeFunction(Function *F, - SmallVectorImpl<Function *> &Specializations) { + /// This function decides whether it's worthwhile to specialize function \p F + /// based on the known constant values its arguments can take on, i.e. it + /// calculates a gain and returns a list of actual arguments that are deemed + /// profitable to specialize. Specialization is performed on the first + /// interesting argument. Specializations based on additional arguments will + /// be evaluated on following iterations of the main IPSCCP solve loop. + SmallVector<ArgInfo> calculateGains(Function *F, InstructionCost Cost) { + SmallVector<ArgInfo> Worklist; + // Determine if we should specialize the function based on the values the + // argument can take on. If specialization is not profitable, we continue + // on to the next argument. + for (Argument &FormalArg : F->args()) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " + << FormalArg.getName() << "\n"); + // Determine if this argument is interesting. If we know the argument can + // take on any constant values, they are collected in Constants. If the + // argument can only ever equal a constant value in Constants, the + // function will be completely specialized, and the IsPartial flag will + // be set to false by isArgumentInteresting (that function only adds + // values to the Constants list that are deemed profitable). 
+ bool IsPartial = true; + SmallVector<Constant *> ActualConstArg; + if (!isArgumentInteresting(&FormalArg, ActualConstArg, IsPartial)) { + LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); + continue; + } + + for (auto *ActualArg : ActualConstArg) { + InstructionCost Gain = + ForceFunctionSpecialization + ? 1 + : getSpecializationBonus(&FormalArg, ActualArg) - Cost; + if (Gain <= 0) + continue; + Worklist.push_back({F, &FormalArg, ActualArg, Gain}); + } + + if (Worklist.empty()) + continue; + + // Sort the candidates in descending order. + llvm::stable_sort(Worklist, [](const ArgInfo &L, const ArgInfo &R) { + return L.Gain > R.Gain; + }); + + // Truncate the worklist to 'MaxClonesThreshold' candidates if + // necessary. + if (Worklist.size() > MaxClonesThreshold) { + LLVM_DEBUG(dbgs() << "FnSpecialization: number of candidates exceed " + << "the maximum number of clones threshold.\n" + << "Truncating worklist to " << MaxClonesThreshold + << " candidates.\n"); + Worklist.erase(Worklist.begin() + MaxClonesThreshold, + Worklist.end()); + } + + if (IsPartial || Worklist.size() < ActualConstArg.size()) + for (auto &ActualArg : Worklist) + ActualArg.Partial = true; + + LLVM_DEBUG(dbgs() << "Sorted list of candidates by gain:\n"; + for (auto &C + : Worklist) { + dbgs() << "- Function = " << C.Fn->getName() << ", "; + dbgs() << "FormalArg = " << C.Arg->getName() << ", "; + dbgs() << "ActualArg = " << C.Const->getName() << ", "; + dbgs() << "Gain = " << C.Gain << "\n"; + }); + + // FIXME: Only one argument per function. + break; + } + return Worklist; + } + + bool isCandidateFunction(Function *F, FuncList &Specializations) { // Do not specialize the cloned function again. 
if (SpecializedFuncs.contains(F)) return false; @@ -362,84 +445,32 @@ private: LLVM_DEBUG(dbgs() << "FnSpecialization: Try function: " << F->getName() << "\n"); + return true; + } - // Determine if it would be profitable to create a specialization of the - // function where the argument takes on the given constant value. If so, - // add the constant to Constants. - auto FnSpecCost = getSpecializationCost(F); - if (!FnSpecCost.isValid()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Invalid specialisation cost.\n"); - return false; - } - - LLVM_DEBUG(dbgs() << "FnSpecialization: func specialisation cost: "; - FnSpecCost.print(dbgs()); dbgs() << "\n"); - - // Determine if we should specialize the function based on the values the - // argument can take on. If specialization is not profitable, we continue - // on to the next argument. - for (Argument &A : F->args()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing arg: " << A.getName() - << "\n"); - // True if this will be a partial specialization. We will need to keep - // the original function around in addition to the added specializations. - bool IsPartial = true; - - // Determine if this argument is interesting. If we know the argument can - // take on any constant values, they are collected in Constants. If the - // argument can only ever equal a constant value in Constants, the - // function will be completely specialized, and the IsPartial flag will - // be set to false by isArgumentInteresting (that function only adds - // values to the Constants list that are deemed profitable). 
- SmallVector<Constant *, 4> Constants; - if (!isArgumentInteresting(&A, Constants, FnSpecCost, IsPartial)) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is not interesting\n"); - continue; - } - - assert(!Constants.empty() && "No constants on which to specialize"); - LLVM_DEBUG(dbgs() << "FnSpecialization: Argument is interesting!\n" - << "FnSpecialization: Specializing '" << F->getName() - << "' on argument: " << A << "\n" - << "FnSpecialization: Constants are:\n\n"; - for (unsigned I = 0; I < Constants.size(); ++I) dbgs() - << *Constants[I] << "\n"; - dbgs() << "FnSpecialization: End of constants\n\n"); - - // Create a version of the function in which the argument is marked - // constant with the given value. - for (auto *C : Constants) { - // Clone the function. We leave the ValueToValueMap empty to allow - // IPSCCP to propagate the constant arguments. - Function *Clone = cloneCandidateFunction(F); - Argument *ClonedArg = Clone->arg_begin() + A.getArgNo(); - - // Rewrite calls to the function so that they call the clone instead. - rewriteCallSites(F, Clone, *ClonedArg, C); + void specializeFunction(ArgInfo &AI, FuncList &Specializations) { + Function *Clone = cloneCandidateFunction(AI.Fn); + Argument *ClonedArg = Clone->getArg(AI.Arg->getArgNo()); - // Initialize the lattice state of the arguments of the function clone, - // marking the argument on which we specialized the function constant - // with the given value. - Solver.markArgInFuncSpecialization(F, ClonedArg, C); + // Rewrite calls to the function so that they call the clone instead. + rewriteCallSites(AI.Fn, Clone, *ClonedArg, AI.Const); - // Mark all the specialized functions - Specializations.push_back(Clone); - NbFunctionsSpecialized++; - } + // Initialize the lattice state of the arguments of the function clone, + // marking the argument on which we specialized the function constant + // with the given value. 
+ Solver.markArgInFuncSpecialization(AI.Fn, ClonedArg, AI.Const); - // If the function has been completely specialized, the original function - // is no longer needed. Mark it unreachable. - if (!IsPartial) - Solver.markFunctionUnreachable(F); + // Mark all the specialized functions + Specializations.push_back(Clone); + NbFunctionsSpecialized++; - // FIXME: Only one argument per function. - return true; - } - - return false; + // If the function has been completely specialized, the original function + // is no longer needed. Mark it unreachable. + if (!AI.Partial) + Solver.markFunctionUnreachable(AI.Fn); } - /// Compute the cost of specializing function \p F. + /// Compute and return the cost of specializing function \p F. InstructionCost getSpecializationCost(Function *F) { // Compute the code metrics for the function. SmallPtrSet<const Value *, 32> EphValues; @@ -578,9 +609,7 @@ private: /// /// \returns true if the function should be specialized on the given /// argument. - bool isArgumentInteresting(Argument *A, - SmallVectorImpl<Constant *> &Constants, - const InstructionCost &FnSpecCost, + bool isArgumentInteresting(Argument *A, ConstList &Constants, bool &IsPartial) { // For now, don't attempt to specialize functions based on the values of // composite types. @@ -608,42 +637,8 @@ private: // // TODO 2: this currently does not support constants, i.e. integer ranges. 
// - SmallVector<Constant *, 4> PossibleConstants; - bool AllConstant = getPossibleConstants(A, PossibleConstants); - if (PossibleConstants.empty()) { - LLVM_DEBUG(dbgs() << "FnSpecialization: no possible constants found\n"); - return false; - } - if (PossibleConstants.size() > MaxConstantsThreshold) { - LLVM_DEBUG(dbgs() << "FnSpecialization: number of constants found exceed " - << "the maximum number of constants threshold.\n"); - return false; - } - - for (auto *C : PossibleConstants) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Constant: " << *C << "\n"); - if (ForceFunctionSpecialization) { - LLVM_DEBUG(dbgs() << "FnSpecialization: Forced!\n"); - Constants.push_back(C); - continue; - } - if (getSpecializationBonus(A, C) > FnSpecCost) { - LLVM_DEBUG(dbgs() << "FnSpecialization: profitable!\n"); - Constants.push_back(C); - } else { - LLVM_DEBUG(dbgs() << "FnSpecialization: not profitable\n"); - } - } - - // None of the constant values the argument can take on were deemed good - // candidates on which to specialize the function. - if (Constants.empty()) - return false; - - // This will be a partial specialization if some of the constants were - // rejected due to their profitability. - IsPartial = !AllConstant || PossibleConstants.size() != Constants.size(); - + IsPartial = !getPossibleConstants(A, Constants); + LLVM_DEBUG(dbgs() << "FnSpecialization: interesting arg: " << *A << "\n"); return true; } @@ -653,8 +648,7 @@ private: /// \returns true if all of the values the argument can take on are constant /// (e.g., the argument's parent function cannot be called with an /// overdefined value). - bool getPossibleConstants(Argument *A, - SmallVectorImpl<Constant *> &Constants) { + bool getPossibleConstants(Argument *A, ConstList &Constants) { Function *F = A->getParent(); bool AllConstant = true; @@ -681,7 +675,7 @@ private: // For now, constant expressions are fine but only if they are function // calls. 
- if (auto *CE = dyn_cast<ConstantExpr>(V)) + if (auto *CE = dyn_cast<ConstantExpr>(V)) if (!isa<Function>(CE->getOperand(0))) return false; @@ -737,6 +731,29 @@ private: } } } + + void updateSpecializedFuncs(FuncList &FuncDecls, + FuncList &CurrentSpecializations) { + for (auto *SpecializedFunc : CurrentSpecializations) { + SpecializedFuncs.insert(SpecializedFunc); + + // Initialize the state of the newly created functions, marking them + // argument-tracked and executable. + if (SpecializedFunc->hasExactDefinition() && + !SpecializedFunc->hasFnAttribute(Attribute::Naked)) + Solver.addTrackedFunction(SpecializedFunc); + + Solver.addArgumentTrackedFunction(SpecializedFunc); + FuncDecls.push_back(SpecializedFunc); + Solver.markBlockExecutable(&SpecializedFunc->front()); + + // Replace the function arguments for the specialized functions. + for (Argument &Arg : SpecializedFunc->args()) + if (!Arg.use_empty() && tryToReplaceWithConstant(&Arg)) + LLVM_DEBUG(dbgs() << "FnSpecialization: Replaced constant argument: " + << Arg.getName() << "\n"); + } + } }; } // namespace diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ba7589c2bf60..b1f3ff15c97b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, else if (auto *LI = dyn_cast<LoadInst>(U)) { // A load from zeroinitializer is always zeroinitializer, regardless of // any applied offset. 
- if (Init->isNullValue()) { - LI->replaceAllUsesWith(Constant::getNullValue(LI->getType())); + Type *Ty = LI->getType(); + if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) { + LI->replaceAllUsesWith(Constant::getNullValue(Ty)); EraseFromParent(LI); continue; } @@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV, PtrOp = PtrOp->stripAndAccumulateConstantOffsets( DL, Offset, /* AllowNonInbounds */ true); if (PtrOp == GV) { - if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(), - Offset, DL)) { + if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) { LI->replaceAllUsesWith(Value); EraseFromParent(LI); } @@ -368,8 +368,7 @@ static bool isSafeSROAGEP(User *U) { return false; } - return llvm::all_of(U->users(), - [](User *UU) { return isSafeSROAElementUse(UU); }); + return llvm::all_of(U->users(), isSafeSROAElementUse); } /// Return true if the specified instruction is a safe user of a derived diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 833049d6896f..a964fcde0396 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -294,7 +294,7 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, // Find all incoming values from the outlining region. 
int NumIncomingVals = 0; for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i) - if (find(Region, PN.getIncomingBlock(i)) != Region.end()) { + if (llvm::is_contained(Region, PN.getIncomingBlock(i))) { ++NumIncomingVals; if (NumIncomingVals > 1) { ++NumSplitExitPhis; diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 992c2b292e1e..4e3689f09536 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -856,6 +856,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (InlineHistoryID != -1 && inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) { + LLVM_DEBUG(dbgs() << "Skipping inlining due to history: " + << F.getName() << " -> " << Callee.getName() << "\n"); setInlineRemark(*CB, "recursive"); continue; } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index f78971f0e586..c0bb19e184d6 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1774,8 +1774,9 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; for (Use &U : llvm::make_early_inc_range(Old->uses())) { - // Skip block addresses - if (isa<BlockAddress>(U.getUser())) + // Skip block addresses and no_cfi values, which refer to the function + // body instead of the jump table. 
+ if (isa<BlockAddress, NoCFIValue>(U.getUser())) continue; // Skip direct calls to externally defined or non-dso_local functions @@ -1802,7 +1803,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); + Old->replaceUsesWithIf(New, isDirectCall); } bool LowerTypeTestsModule::lower() { diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 055ee6b50296..f289e3ecc979 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -3964,6 +3964,9 @@ struct AAKernelInfoCallSite : AAKernelInfo { case OMPRTL___kmpc_master: case OMPRTL___kmpc_end_master: case OMPRTL___kmpc_barrier: + case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2: + case OMPRTL___kmpc_nvptx_end_reduce_nowait: break; case OMPRTL___kmpc_distribute_static_init_4: case OMPRTL___kmpc_distribute_static_init_4u: @@ -4010,6 +4013,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { break; case OMPRTL___kmpc_omp_task: // We do not look into tasks right now, just give up. + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); ReachedUnknownParallelRegions.insert(&CB); break; @@ -4020,6 +4024,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { default: // Unknown OpenMP runtime calls cannot be executed in SPMD-mode, // generally. However, they do not hide parallel regions. 
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); break; } @@ -4079,6 +4084,7 @@ struct AAKernelInfoCallSite : AAKernelInfo { SPMDCompatibilityTracker.insert(&CB); break; default: + SPMDCompatibilityTracker.indicatePessimisticFixpoint(); SPMDCompatibilityTracker.insert(&CB); } diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp index bae9a1e27e75..7334bf695b67 100644 --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -32,7 +32,7 @@ ContextTrieNode *ContextTrieNode::getChildContext(const LineLocation &CallSite, if (CalleeName.empty()) return getHottestChildContext(CallSite); - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) return &It->second; @@ -65,7 +65,8 @@ ContextTrieNode::getHottestChildContext(const LineLocation &CallSite) { ContextTrieNode &ContextTrieNode::moveToChildContext( const LineLocation &CallSite, ContextTrieNode &&NodeToMove, uint32_t ContextFramesToRemove, bool DeleteNode) { - uint64_t Hash = nodeHash(NodeToMove.getFuncName(), CallSite); + uint64_t Hash = + FunctionSamples::getCallSiteHash(NodeToMove.getFuncName(), CallSite); assert(!AllChildContext.count(Hash) && "Node to remove must exist"); LineLocation OldCallSite = NodeToMove.CallSiteLoc; ContextTrieNode &OldParentContext = *NodeToMove.getParentContext(); @@ -108,7 +109,7 @@ ContextTrieNode &ContextTrieNode::moveToChildContext( void ContextTrieNode::removeChildContext(const LineLocation &CallSite, StringRef CalleeName) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); // Note this essentially calls dtor and destroys that child context AllChildContext.erase(Hash); } @@ -174,21 +175,9 @@ void 
ContextTrieNode::dumpTree() { } } -uint64_t ContextTrieNode::nodeHash(StringRef ChildName, - const LineLocation &Callsite) { - // We still use child's name for child hash, this is - // because for children of root node, we don't have - // different line/discriminator, and we'll rely on name - // to differentiate children. - uint64_t NameHash = std::hash<std::string>{}(ChildName.str()); - uint64_t LocId = - (((uint64_t)Callsite.LineOffset) << 32) | Callsite.Discriminator; - return NameHash + (LocId << 5) + LocId; -} - ContextTrieNode *ContextTrieNode::getOrCreateChildContext( const LineLocation &CallSite, StringRef CalleeName, bool AllowCreate) { - uint64_t Hash = nodeHash(CalleeName, CallSite); + uint64_t Hash = FunctionSamples::getCallSiteHash(CalleeName, CallSite); auto It = AllChildContext.find(Hash); if (It != AllChildContext.end()) { assert(It->second.getFuncName() == CalleeName && diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index b8fac9d47763..bc6051de90c4 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -467,6 +467,9 @@ protected: void emitOptimizationRemarksForInlineCandidates( const SmallVectorImpl<CallBase *> &Candidates, const Function &F, bool Hot); + void promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F); std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); std::unique_ptr<ProfiledCallGraph> buildProfiledCallGraph(CallGraph &CG); void generateMDProfMetadata(Function &F); @@ -485,7 +488,7 @@ protected: std::unique_ptr<SampleContextTracker> ContextTracker; /// Flag indicating whether input profile is context-sensitive - bool ProfileIsCS = false; + bool ProfileIsCSFlat = false; /// Flag indicating which LTO/ThinLTO phase the pass is invoked in. 
/// @@ -602,7 +605,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -641,7 +644,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) { // call instruction should have 0 count. // For CS profile, the callsite count of previously inlined callees is // populated with the entry count of the callees. - if (!ProfileIsCS) + if (!ProfileIsCSFlat) if (const auto *CB = dyn_cast<CallBase>(&Inst)) if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB)) return 0; @@ -695,7 +698,7 @@ SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const { if (Function *Callee = Inst.getCalledFunction()) CalleeName = Callee->getName(); - if (ProfileIsCS) + if (ProfileIsCSFlat) return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName); const FunctionSamples *FS = findFunctionSamples(Inst); @@ -727,7 +730,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples( FunctionSamples::getGUID(R->getName()); }; - if (ProfileIsCS) { + if (ProfileIsCSFlat) { auto CalleeSamples = ContextTracker->getIndirectCalleeContextSamplesFor(DIL); if (CalleeSamples.empty()) @@ -780,7 +783,7 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const { auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) { - if (ProfileIsCS) + if (ProfileIsCSFlat) it.first->second = ContextTracker->getContextSamplesFor(DIL); else it.first->second = @@ -1039,7 +1042,7 @@ void SampleProfileLoader::findExternalInlineCandidate( // For AutoFDO profile, retrieve candidate profiles by walking over // the nested inlinee profiles. 
- if (!ProfileIsCS) { + if (!ProfileIsCSFlat) { Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold); return; } @@ -1134,7 +1137,7 @@ bool SampleProfileLoader::inlineHotFunctions( assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) && "GUIDToFuncNameMap has to be populated"); AllCandidates.push_back(CB); - if (FS->getEntrySamples() > 0 || ProfileIsCS) + if (FS->getEntrySamples() > 0 || ProfileIsCSFlat) LocalNotInlinedCallSites.try_emplace(CB, FS); if (callsiteIsHot(FS, PSI, ProfAccForSymsInList)) Hot = true; @@ -1156,11 +1159,9 @@ bool SampleProfileLoader::inlineHotFunctions( } for (CallBase *I : CIS) { Function *CalledFunction = I->getCalledFunction(); - InlineCandidate Candidate = { - I, - LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I] - : nullptr, - 0 /* dummy count */, 1.0 /* dummy distribution factor */}; + InlineCandidate Candidate = {I, LocalNotInlinedCallSites.lookup(I), + 0 /* dummy count */, + 1.0 /* dummy distribution factor */}; // Do not inline recursive calls. 
if (CalledFunction == &F) continue; @@ -1198,53 +1199,9 @@ bool SampleProfileLoader::inlineHotFunctions( } // For CS profile, profile for not inlined context will be merged when - // base profile is being trieved - if (ProfileIsCS) - return Changed; - - // Accumulate not inlined callsite information into notInlinedSamples - for (const auto &Pair : LocalNotInlinedCallSites) { - CallBase *I = Pair.getFirst(); - Function *Callee = I->getCalledFunction(); - if (!Callee || Callee->isDeclaration()) - continue; - - ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", - I->getDebugLoc(), I->getParent()) - << "previous inlining not repeated: '" - << ore::NV("Callee", Callee) << "' into '" - << ore::NV("Caller", &F) << "'"); - - ++NumCSNotInlined; - const FunctionSamples *FS = Pair.getSecond(); - if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { - continue; - } - - if (ProfileMergeInlinee) { - // A function call can be replicated by optimizations like callsite - // splitting or jump threading and the replicates end up sharing the - // sample nested callee profile instead of slicing the original inlinee's - // profile. We want to do merge exactly once by filtering out callee - // profiles with a non-zero head sample count. - if (FS->getHeadSamples() == 0) { - // Use entry samples as head samples during the merge, as inlinees - // don't have head samples. - const_cast<FunctionSamples *>(FS)->addHeadSamples( - FS->getEntrySamples()); - - // Note that we have to do the merge right after processing function. - // This allows OutlineFS's profile to be used for annotation during - // top-down processing of functions' annotation. - FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); - OutlineFS->merge(*FS); - } - } else { - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); - } - } + // base profile is being retrieved. 
+ if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } @@ -1285,7 +1242,7 @@ bool SampleProfileLoader::tryInlineCandidate( InlinedCallSites->push_back(I); } - if (ProfileIsCS) + if (ProfileIsCSFlat) ContextTracker->markContextSamplesInlined(Candidate.CalleeSamples); ++NumCSInlined; @@ -1430,7 +1387,6 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) { bool SampleProfileLoader::inlineHotFunctionsWithPriority( Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { - assert(ProfileIsCS && "Prioritiy based inliner only works with CSSPGO now"); // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure // Profile symbol list is ignored when profile-sample-accurate is on. @@ -1467,6 +1423,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( if (ExternalInlineAdvisor) SizeLimit = std::numeric_limits<unsigned>::max(); + DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites; + // Perform iterative BFS call site prioritized inlining bool Changed = false; while (!CQueue.empty() && F.getInstructionCount() < SizeLimit) { @@ -1521,6 +1479,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( } ICPCount++; Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, FS); } } } else if (CalledFunction && CalledFunction->getSubprogram() && @@ -1532,6 +1492,8 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( CQueue.emplace(NewCandidate); } Changed = true; + } else if (!ContextTracker) { + LocalNotInlinedCallSites.try_emplace(I, Candidate.CalleeSamples); } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { findExternalInlineCandidate(I, findCalleeFunctionSamples(*I), @@ -1549,9 +1511,63 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority( ++NumCSInlinedHitGrowthLimit; } + // For CS profile, profile for not inlined context will be merged when + // base profile is being 
retrieved. + if (!FunctionSamples::ProfileIsCSFlat) + promoteMergeNotInlinedContextSamples(LocalNotInlinedCallSites, F); return Changed; } +void SampleProfileLoader::promoteMergeNotInlinedContextSamples( + DenseMap<CallBase *, const FunctionSamples *> NonInlinedCallSites, + const Function &F) { + // Accumulate not inlined callsite information into notInlinedSamples + for (const auto &Pair : NonInlinedCallSites) { + CallBase *I = Pair.getFirst(); + Function *Callee = I->getCalledFunction(); + if (!Callee || Callee->isDeclaration()) + continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; + const FunctionSamples *FS = Pair.getSecond(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // A function call can be replicated by optimizations like callsite + // splitting or jump threading and the replicates end up sharing the + // sample nested callee profile instead of slicing the original + // inlinee's profile. We want to do merge exactly once by filtering out + // callee profiles with a non-zero head sample count. + if (FS->getHeadSamples() == 0) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + const_cast<FunctionSamples *>(FS)->addHeadSamples( + FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS, 1); + // Set outlined profile to be synthetic to not bias the inliner. 
+ OutlineFS->SetContextSynthetic(); + } + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } + } +} + /// Returns the sorted CallTargetMap \p M by count in descending order. static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(const SampleRecord::CallTargetMap &M) { @@ -1607,7 +1623,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { // With CSSPGO all indirect call targets are counted torwards the // original indirect call site in the profile, including both // inlined and non-inlined targets. - if (!FunctionSamples::ProfileIsCS) { + if (!FunctionSamples::ProfileIsCSFlat) { if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) { for (const auto &NameFS : *M) @@ -1754,7 +1770,7 @@ bool SampleProfileLoader::emitAnnotations(Function &F) { } DenseSet<GlobalValue::GUID> InlinedGUIDs; - if (ProfileIsCS && CallsitePrioritizedInline) + if (CallsitePrioritizedInline) Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs); else Changed |= inlineHotFunctions(F, InlinedGUIDs); @@ -1782,7 +1798,7 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", std::unique_ptr<ProfiledCallGraph> SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) { std::unique_ptr<ProfiledCallGraph> ProfiledCG; - if (ProfileIsCS) + if (ProfileIsCSFlat) ProfiledCG = std::make_unique<ProfiledCallGraph>(*ContextTracker); else ProfiledCG = std::make_unique<ProfiledCallGraph>(Reader->getProfiles()); @@ -1828,7 +1844,7 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { assert(&CG->getModule() == &M); if (UseProfiledCallGraph || - (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) { + (ProfileIsCSFlat && !UseProfiledCallGraph.getNumOccurrences())) { // Use profiled call edges to augment the top-down order. 
There are cases // that the top-down order computed based on the static call graph doesn't // reflect real execution order. For example @@ -1961,10 +1977,8 @@ bool SampleProfileLoader::doInitialization(Module &M, } // Apply tweaks if context-sensitive profile is available. - if (Reader->profileIsCS()) { - ProfileIsCS = true; - FunctionSamples::ProfileIsCS = true; - + if (Reader->profileIsCSFlat() || Reader->profileIsCSNested()) { + ProfileIsCSFlat = Reader->profileIsCSFlat(); // Enable priority-base inliner and size inline by default for CSSPGO. if (!ProfileSizeInline.getNumOccurrences()) ProfileSizeInline = true; @@ -1982,10 +1996,15 @@ bool SampleProfileLoader::doInitialization(Module &M, // Enable iterative-BFI by default for CSSPGO. if (!UseIterativeBFIInference.getNumOccurrences()) UseIterativeBFIInference = true; + // Enable Profi by default for CSSPGO. + if (!SampleProfileUseProfi.getNumOccurrences()) + SampleProfileUseProfi = true; - // Tracker for profiles under different context - ContextTracker = std::make_unique<SampleContextTracker>( - Reader->getProfiles(), &GUIDToFuncNameMap); + if (FunctionSamples::ProfileIsCSFlat) { + // Tracker for profiles under different context + ContextTracker = std::make_unique<SampleContextTracker>( + Reader->getProfiles(), &GUIDToFuncNameMap); + } } // Load pseudo probe descriptors for probe-based function samples. @@ -1994,7 +2013,8 @@ bool SampleProfileLoader::doInitialization(Module &M, if (!ProbeManager->moduleIsProbed(M)) { const char *Msg = "Pseudo-probe-based profile requires SampleProfileProbePass"; - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); + Ctx.diagnose(DiagnosticInfoSampleProfile(M.getModuleIdentifier(), Msg, + DS_Warning)); return false; } } @@ -2062,7 +2082,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } // Account for cold calls not inlined.... 
- if (!ProfileIsCS) + if (!ProfileIsCSFlat) for (const std::pair<Function *, NotInlinedProfileInfo> &pair : notInlinedCallInfo) updateProfileCallee(pair.first, pair.second.entryCount); @@ -2138,7 +2158,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) ORE = OwnedORE.get(); } - if (ProfileIsCS) + if (ProfileIsCSFlat) Samples = ContextTracker->getBaseSamplesFor(F); else Samples = Reader->getSamplesFor(F); diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 0cc1b37844f6..daaf6cbeb3fd 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -87,7 +87,8 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId, if (isa<Function>(&ExportGV) && allowPromotionAlias(OldName)) { // Create a local alias with the original name to avoid breaking // references from inline assembly. - std::string Alias = ".set " + OldName + "," + NewName + "\n"; + std::string Alias = + ".lto_set_conditional " + OldName + "," + NewName + "\n"; ExportM.appendModuleInlineAsm(Alias); } } diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 61054e7ae46f..6acace1d9fd4 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -359,6 +359,36 @@ template <> struct DenseMapInfo<VTableSlotSummary> { namespace { +// Returns true if the function must be unreachable based on ValueInfo. +// +// In particular, identifies a function as unreachable in the following +// conditions +// 1) All summaries are live. +// 2) All function summaries indicate it's unreachable +bool mustBeUnreachableFunction(ValueInfo TheFnVI) { + if ((!TheFnVI) || TheFnVI.getSummaryList().empty()) { + // Returns false if ValueInfo is absent, or the summary list is empty + // (e.g., function declarations). 
+ return false; + } + + for (auto &Summary : TheFnVI.getSummaryList()) { + // Conservatively returns false if any non-live functions are seen. + // In general either all summaries should be live or all should be dead. + if (!Summary->isLive()) + return false; + if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) { + if (!FS->fflags().MustBeUnreachable) + return false; + } + // Do nothing if a non-function has the same GUID (which is rare). + // This is correct since non-function summaries are not relevant. + } + // All function summaries are live and all of them agree that the function is + // unreachble. + return true; +} + // A virtual call site. VTable is the loaded virtual table pointer, and CS is // the indirect virtual call. struct VirtualCallSite { @@ -562,10 +592,12 @@ struct DevirtModule { void buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap); + bool tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, - uint64_t ByteOffset); + uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary); void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); @@ -640,6 +672,23 @@ struct DevirtModule { bool run(); + // Look up the corresponding ValueInfo entry of `TheFn` in `ExportSummary`. + // + // Caller guarantees that `ExportSummary` is not nullptr. + static ValueInfo lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary); + + // Returns true if the function definition must be unreachable. + // + // Note if this helper function returns true, `F` is guaranteed + // to be unreachable; if it returns false, `F` might still + // be unreachable but not covered by this helper function. + // + // Implementation-wise, if function definition is present, IR is analyzed; if + // not, look up function flags from ExportSummary as a fallback. 
+ static bool mustBeUnreachableFunction(Function *const F, + ModuleSummaryIndex *ExportSummary); + // Lower the module using the action and summary passed as command line // arguments. For testing purposes only. static bool @@ -969,7 +1018,8 @@ void DevirtModule::buildTypeIdentifierMap( bool DevirtModule::tryFindVirtualCallTargets( std::vector<VirtualCallTarget> &TargetsForSlot, - const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) { + const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset, + ModuleSummaryIndex *ExportSummary) { for (const TypeMemberInfo &TM : TypeMemberInfos) { if (!TM.Bits->GV->isConstant()) return false; @@ -997,6 +1047,11 @@ bool DevirtModule::tryFindVirtualCallTargets( if (Fn->getName() == "__cxa_pure_virtual") continue; + // We can disregard unreachable functions as possible call targets, as + // unreachable functions shouldn't be called. + if (mustBeUnreachableFunction(Fn, ExportSummary)) + continue; + TargetsForSlot.push_back({Fn, &TM}); } @@ -1053,6 +1108,9 @@ bool DevirtIndex::tryFindVirtualCallTargets( if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) continue; + if (mustBeUnreachableFunction(VTP.FuncVI)) + continue; + TargetsForSlot.push_back(VTP.FuncVI); } } @@ -1744,7 +1802,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); - NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); + NewGV->setAlignment(B.GV->getAlign()); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. 
@@ -2014,6 +2072,44 @@ void DevirtModule::removeRedundantTypeTests() { } } +ValueInfo +DevirtModule::lookUpFunctionValueInfo(Function *TheFn, + ModuleSummaryIndex *ExportSummary) { + assert((ExportSummary != nullptr) && + "Caller guarantees ExportSummary is not nullptr"); + + const auto TheFnGUID = TheFn->getGUID(); + const auto TheFnGUIDWithExportedName = GlobalValue::getGUID(TheFn->getName()); + // Look up ValueInfo with the GUID in the current linkage. + ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFnGUID); + // If no entry is found and GUID is different from GUID computed using + // exported name, look up ValueInfo with the exported name unconditionally. + // This is a fallback. + // + // The reason to have a fallback: + // 1. LTO could enable global value internalization via + // `enable-lto-internalization`. + // 2. The GUID in ExportedSummary is computed using exported name. + if ((!TheFnVI) && (TheFnGUID != TheFnGUIDWithExportedName)) { + TheFnVI = ExportSummary->getValueInfo(TheFnGUIDWithExportedName); + } + return TheFnVI; +} + +bool DevirtModule::mustBeUnreachableFunction( + Function *const F, ModuleSummaryIndex *ExportSummary) { + // First, learn unreachability by analyzing function IR. + if (!F->isDeclaration()) { + // A function must be unreachable if its entry block ends with an + // 'unreachable'. + return isa<UnreachableInst>(F->getEntryBlock().getTerminator()); + } + // Learn unreachability from ExportSummary if ExportSummary is present. + return ExportSummary && + ::mustBeUnreachableFunction( + DevirtModule::lookUpFunctionValueInfo(F, ExportSummary)); +} + bool DevirtModule::run() { // If only some of the modules were split, we cannot correctly perform // this transformation. 
We already checked for the presense of type tests @@ -2137,7 +2233,7 @@ bool DevirtModule::run() { cast<MDString>(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos, - S.first.ByteOffset)) { + S.first.ByteOffset, ExportSummary)) { if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index dc55b5a31596..de1034c910d5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1795,6 +1795,55 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I, } } + // (~A & B & C) | ... --> ... + // (~A | B | C) | ... --> ... + // TODO: One use checks are conservative. We just need to check that a total + // number of multiple used values does not exceed reduction + // in operations. + if (match(Op0, + m_OneUse(m_c_BinOp(FlippedOpcode, + m_BinOp(FlippedOpcode, m_Value(B), m_Value(C)), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))))) || + match(Op0, m_OneUse(m_c_BinOp( + FlippedOpcode, + m_c_BinOp(FlippedOpcode, m_Value(C), + m_CombineAnd(m_Value(X), m_Not(m_Value(A)))), + m_Value(B))))) { + // X = ~A + // (~A & B & C) | ~(A | B | C) --> ~(A | (B ^ C)) + // (~A | B | C) & ~(A & B & C) --> (~A | (B ^ C)) + if (match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(B)), + m_Specific(C))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(B), m_Specific(C)), + m_Specific(A))))) || + match(Op1, m_OneUse(m_Not(m_c_BinOp( + Opcode, m_c_BinOp(Opcode, m_Specific(A), m_Specific(C)), + m_Specific(B)))))) { + Value *Xor = Builder.CreateXor(B, C); + return (Opcode == Instruction::Or) + ? 
BinaryOperator::CreateNot(Builder.CreateOr(Xor, A)) + : BinaryOperator::CreateOr(Xor, X); + } + + // (~A & B & C) | ~(A | B) --> (C | ~B) & ~A + // (~A | B | C) & ~(A & B) --> (C & ~B) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(B))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, C, Builder.CreateNot(B)), + X); + + // (~A & B & C) | ~(A | C) --> (B | ~C) & ~A + // (~A | B | C) & ~(A & C) --> (B & ~C) | ~A + if (match(Op1, m_OneUse(m_Not(m_OneUse( + m_c_BinOp(Opcode, m_Specific(A), m_Specific(C))))))) + return BinaryOperator::Create( + FlippedOpcode, Builder.CreateBinOp(Opcode, B, Builder.CreateNot(C)), + X); + } + return nullptr; } @@ -2102,6 +2151,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); return SelectInst::Create(Cmp, Y, Zero); } + // If there's a 'not' of the shifted value, swap the select operands: + // ~(iN X s>> (N-1)) & Y --> (X s< 0) ? 0 : Y + if (match(&I, m_c_And(m_OneUse(m_Not( + m_AShr(m_Value(X), m_SpecificInt(FullShift)))), + m_Value(Y)))) { + Constant *Zero = ConstantInt::getNullValue(Ty); + Value *Cmp = Builder.CreateICmpSLT(X, Zero, "isneg"); + return SelectInst::Create(Cmp, Zero, Y); + } // (~x) & y --> ~(x | (~y)) iff that gets rid of inversions if (sinkNotIntoOtherHandOfAndOrOr(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 7da2669e1d13..14427bd1f2f4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2472,6 +2472,12 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call, Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) { if (!CI->getCalledFunction()) return nullptr; + // Skip optimizing notail and musttail calls so + // LibCallSimplifier::optimizeCall doesn't have to preserve those invariants. 
+ // LibCallSimplifier::optimizeCall should try to preseve tail calls though. + if (CI->isMustTailCall() || CI->isNoTailCall()) + return nullptr; + auto InstCombineRAUW = [this](Instruction *From, Value *With) { replaceInstUsesWith(*From, With); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 33f217659c01..8df4a4529f47 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -157,7 +157,7 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI, Amt = Builder.CreateAdd(Amt, Off); } - AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt); + AllocaInst *New = Builder.CreateAlloca(CastElTy, AI.getAddressSpace(), Amt); New->setAlignment(AI.getAlign()); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -965,13 +965,13 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { if (match(Src, m_VScale(DL))) { if (Trunc.getFunction() && Trunc.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = Trunc.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < DestWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(Trunc, VScale); + Attribute Attr = + Trunc.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < DestWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(Trunc, VScale); + } } } } @@ -1337,14 +1337,13 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - 
.getVScaleRangeArgs() - .second; - unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); - if (MaxVScale > 0 && Log2_32(MaxVScale) < TypeWidth) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + unsigned TypeWidth = Src->getType()->getScalarSizeInBits(); + if (Log2_32(MaxVScale.getValue()) < TypeWidth) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(CI, VScale); + } } } } @@ -1608,13 +1607,12 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) { if (match(Src, m_VScale(DL))) { if (CI.getFunction() && CI.getFunction()->hasFnAttribute(Attribute::VScaleRange)) { - unsigned MaxVScale = CI.getFunction() - ->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (MaxVScale > 0 && Log2_32(MaxVScale) < (SrcBitSize - 1)) { - Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); - return replaceInstUsesWith(CI, VScale); + Attribute Attr = CI.getFunction()->getFnAttribute(Attribute::VScaleRange); + if (Optional<unsigned> MaxVScale = Attr.getVScaleRangeMax()) { + if (Log2_32(MaxVScale.getValue()) < (SrcBitSize - 1)) { + Value *VScale = Builder.CreateVScale(ConstantInt::get(DestTy, 1)); + return replaceInstUsesWith(CI, VScale); + } } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 20c75188ec9f..39b55b028110 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -600,6 +600,7 @@ public: /// Canonicalize the position of binops relative to shufflevector. 
Instruction *foldVectorBinop(BinaryOperator &Inst); Instruction *foldVectorSelect(SelectInst &Sel); + Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf); /// Given a binary operator, cast instruction, or select which has a PHI node /// as operand #0, see if we can fold the instruction into the PHI (which is diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 79a8a065d02a..0dbfdba353c4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -163,7 +163,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), + return isDereferenceableAndAlignedPointer(V, AI->getAlign(), APInt(64, AllocaSize), DL); } @@ -183,7 +183,8 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); + AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(), + nullptr, AI.getName()); New->setAlignment(AI.getAlign()); // Scan to the end of the allocation instructions, to skip over a block of @@ -199,21 +200,13 @@ static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC, Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType()); Value *NullIdx = Constant::getNullValue(IdxTy); Value *Idx[2] = {NullIdx, NullIdx}; - Instruction *NewI = GetElementPtrInst::CreateInBounds( + Instruction *GEP = GetElementPtrInst::CreateInBounds( NewTy, New, Idx, New->getName() + ".sub"); - IC.InsertNewInstBefore(NewI, *It); - - // Gracefully 
handle allocas in other address spaces. - if (AI.getType()->getPointerAddressSpace() != - NewI->getType()->getPointerAddressSpace()) { - NewI = - CastInst::CreatePointerBitCastOrAddrSpaceCast(NewI, AI.getType()); - IC.InsertNewInstBefore(NewI, *It); - } + IC.InsertNewInstBefore(GEP, *It); // Now make everything use the getelementptr instead of the original // allocation. - return IC.replaceInstUsesWith(AI, NewI); + return IC.replaceInstUsesWith(AI, GEP); } } @@ -640,7 +633,6 @@ static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) { return nullptr; StringRef Name = LI.getName(); - assert(LI.getAlignment() && "Alignment must be set at this point"); if (auto *ST = dyn_cast<StructType>(T)) { // If the struct only have one element, we unpack. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 779d298da7a4..aca7ec8d7325 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -755,6 +755,15 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all div ops: + // C / (select Cond, TrueC, FalseC) --> select Cond, (C / TrueC), (C / FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. 
+ if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + const APInt *C2; if (match(Op1, m_APInt(C2))) { Value *X; @@ -1461,6 +1470,15 @@ Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) { if (simplifyDivRemOfSelectWithZeroOp(I)) return &I; + // If the divisor is a select-of-constants, try to constant fold all rem ops: + // C % (select Cond, TrueC, FalseC) --> select Cond, (C % TrueC), (C % FalseC) + // TODO: Adapt simplifyDivRemOfSelectWithZeroOp to allow this and other folds. + if (match(Op0, m_ImmConstant()) && + match(Op1, m_Select(m_Value(), m_ImmConstant(), m_ImmConstant()))) { + if (Instruction *R = FoldOpIntoSelect(I, cast<SelectInst>(Op1))) + return R; + } + if (isa<Constant>(Op1)) { if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) { if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 35739c3b9a21..30f6aab2114b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { return nullptr; // When processing loads, we need to propagate two bits of information to the - // sunk load: whether it is volatile, and what its alignment is. We currently - // don't sink loads when some have their alignment specified and some don't. - // visitLoadInst will propagate an alignment onto the load when TD is around, - // and if TD isn't around, we can't handle the mixed case. + // sunk load: whether it is volatile, and what its alignment is. 
bool isVolatile = FirstLI->isVolatile(); Align LoadAlignment = FirstLI->getAlign(); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); @@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) { !isSafeAndProfitableToSinkLoad(LI)) return nullptr; - LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign())); + LoadAlignment = std::min(LoadAlignment, LI->getAlign()); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 518d3952dce5..a6d6b5199105 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1482,7 +1482,12 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, if (C0->getType() != Sel.getType()) return nullptr; - // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + // ULT with 'add' of a constant is canonical. See foldICmpAddConstant(). + // FIXME: Are there more magic icmp predicate+constant pairs we must avoid? + // Or should we just abandon this transform entirely? + if (Pred == CmpInst::ICMP_ULT && match(X, m_Add(m_Value(), m_Constant()))) + return nullptr; + Value *SelVal0, *SelVal1; // We do not care which one is from where. 
match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e357a9da8b12..4dc712f32536 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1595,12 +1595,6 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, simplifyAndSetOp(I, 0, DemandedElts, UndefElts); simplifyAndSetOp(I, 1, DemandedElts, UndefElts2); - // Any change to an instruction with potential poison must clear those flags - // because we can not guarantee those constraints now. Other analysis may - // determine that it is safe to re-apply the flags. - if (MadeChange) - BO->dropPoisonGeneratingFlags(); - // Output elements are undefined if both are undefined. Consider things // like undef & 0. The result is known zero, not undef. UndefElts &= UndefElts2; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 32e537897140..c6a4602e59e3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -363,6 +363,18 @@ static APInt findDemandedEltsByAllUsers(Value *V) { return UnionUsedElts; } +/// Given a constant index for a extractelement or insertelement instruction, +/// return it with the canonical type if it isn't already canonical. We +/// arbitrarily pick 64 bit as our canonical type. The actual bitwidth doesn't +/// matter, we just want a consistent type to simplify CSE. 
+ConstantInt *getPreferredVectorIndex(ConstantInt *IndexC) { + const unsigned IndexBW = IndexC->getType()->getBitWidth(); + if (IndexBW == 64 || IndexC->getValue().getActiveBits() > 64) + return nullptr; + return ConstantInt::get(IndexC->getContext(), + IndexC->getValue().zextOrTrunc(64)); +} + Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -374,6 +386,10 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { // find a previously computed scalar that was inserted into the vector. auto *IndexC = dyn_cast<ConstantInt>(Index); if (IndexC) { + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(EI, 1, NewIdx); + ElementCount EC = EI.getVectorOperandType()->getElementCount(); unsigned NumElts = EC.getKnownMinValue(); @@ -401,37 +417,6 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (!EC.isScalable() && IndexC->getValue().uge(NumElts)) return nullptr; - // This instruction only demands the single element from the input vector. - // Skip for scalable type, the number of elements is unknown at - // compile-time. - if (!EC.isScalable() && NumElts != 1) { - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse()) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = - SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) - return replaceOperand(EI, 0, V); - } else { - // If the input vector has multiple uses, simplify it based on a union - // of all elements used. 
- APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); - if (!DemandedElts.isAllOnes()) { - APInt UndefElts(NumElts, 0); - if (Value *V = SimplifyDemandedVectorElts( - SrcVec, DemandedElts, UndefElts, 0 /* Depth */, - true /* AllowMultipleUsers */)) { - if (V != SrcVec) { - SrcVec->replaceAllUsesWith(V); - return &EI; - } - } - } - } - } - if (Instruction *I = foldBitcastExtElt(EI)) return I; @@ -473,11 +458,9 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { if (auto *I = dyn_cast<Instruction>(SrcVec)) { if (auto *IE = dyn_cast<InsertElementInst>(I)) { - // Extracting the inserted element? - if (IE->getOperand(2) == Index) - return replaceInstUsesWith(EI, IE->getOperand(1)); - // If the inserted and extracted elements are constants, they must not - // be the same value, extract from the pre-inserted value instead. + // instsimplify already handled the case where the indices are constants + // and equal by value, if both are constants, they must not be the same + // value, extract from the pre-inserted value instead. 
if (isa<Constant>(IE->getOperand(2)) && IndexC) return replaceOperand(EI, 0, IE->getOperand(0)); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { @@ -497,30 +480,27 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { llvm::count_if(GEP->operands(), [](const Value *V) { return isa<VectorType>(V->getType()); }); - if (VectorOps > 1) - return nullptr; - assert(VectorOps == 1 && "Expected exactly one vector GEP operand!"); + if (VectorOps == 1) { + Value *NewPtr = GEP->getPointerOperand(); + if (isa<VectorType>(NewPtr->getType())) + NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); - Value *NewPtr = GEP->getPointerOperand(); - if (isa<VectorType>(NewPtr->getType())) - NewPtr = Builder.CreateExtractElement(NewPtr, IndexC); + SmallVector<Value *> NewOps; + for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { + Value *Op = GEP->getOperand(I); + if (isa<VectorType>(Op->getType())) + NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); + else + NewOps.push_back(Op); + } - SmallVector<Value *> NewOps; - for (unsigned I = 1; I != GEP->getNumOperands(); ++I) { - Value *Op = GEP->getOperand(I); - if (isa<VectorType>(Op->getType())) - NewOps.push_back(Builder.CreateExtractElement(Op, IndexC)); - else - NewOps.push_back(Op); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, + NewOps); + NewGEP->setIsInBounds(GEP->isInBounds()); + return NewGEP; } - - GetElementPtrInst *NewGEP = GetElementPtrInst::Create( - cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr, - NewOps); - NewGEP->setIsInBounds(GEP->isInBounds()); - return NewGEP; } - return nullptr; } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) { // If this is extracting an element from a shufflevector, figure out where // it came from and extract from the appropriate input element instead. 
@@ -554,6 +534,44 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) { } } } + + // Run demanded elements after other transforms as this can drop flags on + // binops. If there's two paths to the same final result, we prefer the + // one which doesn't force us to drop flags. + if (IndexC) { + ElementCount EC = EI.getVectorOperandType()->getElementCount(); + unsigned NumElts = EC.getKnownMinValue(); + // This instruction only demands the single element from the input vector. + // Skip for scalable type, the number of elements is unknown at + // compile-time. + if (!EC.isScalable() && NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) + return replaceOperand(EI, 0, V); + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnes()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } + } + } + } return nullptr; } @@ -1476,6 +1494,11 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) { VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE))) return replaceInstUsesWith(IE, V); + // Canonicalize type of constant indices to i64 to simplify CSE + if (auto *IndexC = dyn_cast<ConstantInt>(IdxOp)) + if (auto *NewIdx = getPreferredVectorIndex(IndexC)) + return replaceOperand(IE, 2, NewIdx); + // If the scalar is bitcast and inserted into undef, do the insert in the // source type followed by bitcast. 
// TODO: Generalize for insert into any constant, not just undef? @@ -2008,9 +2031,7 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf, } /// Try to fold shuffles that are the equivalent of a vector select. -static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, - InstCombiner::BuilderTy &Builder, - const DataLayout &DL) { +Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) { if (!Shuf.isSelect()) return nullptr; @@ -2118,21 +2139,23 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, V = Builder.CreateShuffleVector(X, Y, Mask); } - Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) : - BinaryOperator::Create(BOpc, NewC, V); + Value *NewBO = ConstantsAreOp1 ? Builder.CreateBinOp(BOpc, V, NewC) : + Builder.CreateBinOp(BOpc, NewC, V); // Flags are intersected from the 2 source binops. But there are 2 exceptions: // 1. If we changed an opcode, poison conditions might have changed. // 2. If the shuffle had undef mask elements, the new binop might have undefs // where the original code did not. But if we already made a safe constant, // then there's no danger. - NewBO->copyIRFlags(B0); - NewBO->andIRFlags(B1); - if (DropNSW) - NewBO->setHasNoSignedWrap(false); - if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) - NewBO->dropPoisonGeneratingFlags(); - return NewBO; + if (auto *NewI = dyn_cast<Instruction>(NewBO)) { + NewI->copyIRFlags(B0); + NewI->andIRFlags(B1); + if (DropNSW) + NewI->setHasNoSignedWrap(false); + if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB) + NewI->dropPoisonGeneratingFlags(); + } + return replaceInstUsesWith(Shuf, NewBO); } /// Convert a narrowing shuffle of a bitcasted vector into a vector truncate. 
@@ -2497,7 +2520,7 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; - if (Instruction *I = foldSelectShuffle(SVI, Builder, DL)) + if (Instruction *I = foldSelectShuffle(SVI)) return I; if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian())) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f81624f79e7..eb5eadba194d 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2546,7 +2546,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { return nullptr; } -static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, +static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI, Instruction *AI) { if (isa<ConstantPointerNull>(V)) return true; @@ -2557,12 +2557,34 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI, // through bitcasts of V can cause // the result statement below to be true, even when AI and V (ex: // i8* ->i32* ->i8* of AI) are the same allocations. - return isAllocLikeFn(V, TLI) && V != AI; + return isAllocLikeFn(V, &TLI) && V != AI; +} + +/// Given a call CB which uses an address UsedV, return true if we can prove the +/// call's only possible effect is storing to V. 
+static bool isRemovableWrite(CallBase &CB, Value *UsedV, + const TargetLibraryInfo &TLI) { + if (!CB.use_empty()) + // TODO: add recursion if returned attribute is present + return false; + + if (CB.isTerminator()) + // TODO: remove implementation restriction + return false; + + if (!CB.willReturn() || !CB.doesNotThrow()) + return false; + + // If the only possible side effect of the call is writing to the alloca, + // and the result isn't used, we can safely remove any reads implied by the + // call including those which might read the alloca itself. + Optional<MemoryLocation> Dest = MemoryLocation::getForDest(&CB, TLI); + return Dest && Dest->Ptr == UsedV; } static bool isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakTrackingVH> &Users, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo &TLI) { SmallVector<Instruction*, 4> Worklist; Worklist.push_back(AI); @@ -2627,12 +2649,17 @@ static bool isAllocSiteRemovable(Instruction *AI, } } - if (isFreeCall(I, TLI)) { + if (isRemovableWrite(*cast<CallBase>(I), PI, TLI)) { + Users.emplace_back(I); + continue; + } + + if (isFreeCall(I, &TLI)) { Users.emplace_back(I); continue; } - if (isReallocLikeFn(I, TLI, true)) { + if (isReallocLikeFn(I, &TLI, true)) { Users.emplace_back(I); Worklist.push_back(I); continue; @@ -2676,7 +2703,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } - if (isAllocSiteRemovable(&MI, Users, &TLI)) { + if (isAllocSiteRemovable(&MI, Users, TLI)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { // Lowering all @llvm.objectsize calls first because they may // use a bitcast/GEP of the alloca we are removing. 
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 38c219ce3465..9f26b37bbc79 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -232,6 +232,12 @@ static cl::opt<int> ClTrackOrigins("dfsan-track-origins", cl::desc("Track origins of labels"), cl::Hidden, cl::init(0)); +static cl::opt<bool> ClIgnorePersonalityRoutine( + "dfsan-ignore-personality-routine", + cl::desc("If a personality routine is marked uninstrumented from the ABI " + "list, do not create a wrapper for it."), + cl::Hidden, cl::init(false)); + static StringRef getGlobalTypeString(const GlobalValue &G) { // Types of GlobalVariables are always pointer types. Type *GType = G.getValueType(); @@ -1115,7 +1121,7 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { - NewF->removeFnAttrs(AttrBuilder().addAttribute("split-stack")); + NewF->removeFnAttr("split-stack"); CallInst::Create(DFSanVarargWrapperFn, IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "", BB); @@ -1357,9 +1363,24 @@ bool DataFlowSanitizer::runImpl(Module &M) { std::vector<Function *> FnsToInstrument; SmallPtrSet<Function *, 2> FnsWithNativeABI; SmallPtrSet<Function *, 2> FnsWithForceZeroLabel; + SmallPtrSet<Constant *, 1> PersonalityFns; for (Function &F : M) - if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) + if (!F.isIntrinsic() && !DFSanRuntimeFunctions.contains(&F)) { FnsToInstrument.push_back(&F); + if (F.hasPersonalityFn()) + PersonalityFns.insert(F.getPersonalityFn()->stripPointerCasts()); + } + + if (ClIgnorePersonalityRoutine) { + for (auto *C : PersonalityFns) { + assert(isa<Function>(C) && "Personality routine is not a function!"); + Function *F = cast<Function>(C); + if (!isInstrumented(F)) + FnsToInstrument.erase( + 
std::remove(FnsToInstrument.begin(), FnsToInstrument.end(), F), + FnsToInstrument.end()); + } + } // Give function aliases prefixes when necessary, and build wrappers where the // instrumentedness is inconsistent. diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d1d3b8ffdf7a..de34348606ef 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -26,7 +26,9 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" @@ -40,6 +42,7 @@ #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" @@ -57,6 +60,13 @@ using namespace llvm; #define DEBUG_TYPE "instrprof" +namespace llvm { +cl::opt<bool> + DebugInfoCorrelate("debug-info-correlate", cl::ZeroOrMore, + cl::desc("Use debug info to correlate profiles."), + cl::init(false)); +} // namespace llvm + namespace { cl::opt<bool> DoHashBasedCounterSplit( @@ -641,6 +651,12 @@ void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) { } void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { + // TODO: Value profiling heavily depends on the data section which is omitted + // in lightweight mode. We need to move the value profile pointer to the + // Counter struct to get this working. 
+ assert( + !DebugInfoCorrelate && + "Value profiling is not yet supported with lightweight instrumentation"); GlobalVariable *Name = Ind->getName(); auto It = ProfileDataMap.find(Name); assert(It != ProfileDataMap.end() && It->second.DataVar && @@ -855,6 +871,12 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage(); GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility(); + // Use internal rather than private linkage so the counter variable shows up + // in the symbol table when using debug info for correlation. + if (DebugInfoCorrelate && TT.isOSBinFormatMachO() && + Linkage == GlobalValue::PrivateLinkage) + Linkage = GlobalValue::InternalLinkage; + // Due to the limitation of binder as of 2021/09/28, the duplicate weak // symbols in the same csect won't be discarded. When there are duplicate weak // symbols, we can NOT guarantee that the relocations get resolved to the @@ -916,6 +938,42 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); PD.RegionCounters = CounterPtr; + if (DebugInfoCorrelate) { + if (auto *SP = Fn->getSubprogram()) { + DIBuilder DB(*M, true, SP->getUnit()); + Metadata *FunctionNameAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::FunctionNameAttributeName), + MDString::get(Ctx, getPGOFuncNameVarInitializer(NamePtr)), + }; + Metadata *CFGHashAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::CFGHashAttributeName), + ConstantAsMetadata::get(Inc->getHash()), + }; + Metadata *NumCountersAnnotation[] = { + MDString::get(Ctx, InstrProfCorrelator::NumCountersAttributeName), + ConstantAsMetadata::get(Inc->getNumCounters()), + }; + auto Annotations = DB.getOrCreateArray({ + MDNode::get(Ctx, FunctionNameAnnotation), + MDNode::get(Ctx, CFGHashAnnotation), + MDNode::get(Ctx, NumCountersAnnotation), + }); + auto *DICounter = DB.createGlobalVariableExpression( + SP, 
CounterPtr->getName(), /*LinkageName=*/StringRef(), SP->getFile(), + /*LineNo=*/0, DB.createUnspecifiedType("Profile Data Type"), + CounterPtr->hasLocalLinkage(), /*IsDefined=*/true, /*Expr=*/nullptr, + /*Decl=*/nullptr, /*TemplateParams=*/nullptr, /*AlignInBits=*/0, + Annotations); + CounterPtr->addDebugInfo(DICounter); + DB.finalize(); + } else { + std::string Msg = ("Missing debug info for function " + Fn->getName() + + "; required for profile correlation.") + .str(); + Ctx.diagnose( + DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning)); + } + } auto *Int8PtrTy = Type::getInt8PtrTy(Ctx); // Allocate statically the array of pointers to value profile nodes for @@ -939,6 +997,9 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); } + if (DebugInfoCorrelate) + return PD.RegionCounters; + // Create data variable. auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext()); auto *Int16Ty = Type::getInt16Ty(Ctx); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 4d15b784f486..446e601cd4d7 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -307,6 +307,11 @@ static cl::opt<bool> cl::desc("Enable KernelMemorySanitizer instrumentation"), cl::Hidden, cl::init(false)); +static cl::opt<bool> + ClDisableChecks("msan-disable-checks", + cl::desc("Apply no_sanitize to the whole file"), cl::Hidden, + cl::init(false)); + // This is an experiment to enable handling of cases where shadow is a non-zero // compile-time constant. For some unexplainable reason they were silently // ignored in the instrumentation. 
@@ -1095,7 +1100,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { MemorySanitizerVisitor(Function &F, MemorySanitizer &MS, const TargetLibraryInfo &TLI) : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) { - bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory); + bool SanitizeFunction = + F.hasFnAttribute(Attribute::SanitizeMemory) && !ClDisableChecks; InsertChecks = SanitizeFunction; PropagateShadow = SanitizeFunction; PoisonStack = SanitizeFunction && ClPoisonStack; @@ -1214,7 +1220,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - const Align Alignment = assumeAligned(SI->getAlignment()); + const Align Alignment = SI->getAlign(); const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); @@ -3887,8 +3893,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, - MaybeAlign(I.getAlignment())); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlign()); } if (PoisonStack && MS.TrackOrigins) { diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index af5946325bbb..b6ba1fc2132c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -273,14 +273,14 @@ static cl::opt<bool> PGOVerifyBFI( "internal option -pass-remakrs-analysis=pgo.")); static cl::opt<unsigned> PGOVerifyBFIRatio( - "pgo-verify-bfi-ratio", cl::init(5), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-big -- only print out " + "pgo-verify-bfi-ratio", cl::init(2), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: only print out " "mismatched BFI if the difference percentage is greater than " "this value (in percentage).")); static cl::opt<unsigned> PGOVerifyBFICutoff( - "pgo-verify-bfi-cutoff", cl::init(1), cl::Hidden, - cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose " + "pgo-verify-bfi-cutoff", cl::init(5), cl::Hidden, + cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose " "profile count value is below.")); namespace llvm { @@ -291,6 +291,8 @@ extern cl::opt<PGOViewCountsType> PGOViewCounts; // Command line option to specify the name of the function for CFG dump // Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name= extern cl::opt<std::string> ViewBlockFreqFuncName; + +extern cl::opt<bool> DebugInfoCorrelate; } // namespace llvm static cl::opt<bool> @@ -467,8 +469,9 @@ private: createProfileFileNameVar(M, InstrProfileOutput); // The variable in a comdat may be discarded by LTO. Ensure the // declaration will be retained. 
- appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return false; } std::string InstrProfileOutput; @@ -1616,7 +1619,8 @@ static bool InstrumentAllFunctions( // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. if (!IsCS) - createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry); + createIRLevelProfileFlagVar(M, /*IsCS=*/false, PGOInstrumentEntry, + DebugInfoCorrelate); std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); @@ -1638,8 +1642,9 @@ PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) { createProfileFileNameVar(M, CSInstrName); // The variable in a comdat may be discarded by LTO. Ensure the declaration // will be retained. - appendToCompilerUsed( - M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, PGOInstrumentEntry)); + appendToCompilerUsed(M, createIRLevelProfileFlagVar(M, /*IsCS=*/true, + PGOInstrumentEntry, + DebugInfoCorrelate)); return PreservedAnalyses::all(); } @@ -1774,7 +1779,7 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI, uint64_t Diff = (BFICountValue >= CountValue) ? BFICountValue - CountValue : CountValue - BFICountValue; - if (Diff < CountValue / 100 * PGOVerifyBFIRatio) + if (Diff <= CountValue / 100 * PGOVerifyBFIRatio) continue; } BBMisMatchNum++; diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 27f54f8026e1..37a7053d778e 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -271,8 +271,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // subtree of BB (subtree not including the BB itself). 
DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; InsertPtsMap.reserve(Orders.size() + 1); - for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { - BasicBlock *Node = *RIt; + for (BasicBlock *Node : llvm::reverse(Orders)) { bool NodeInBBs = BBs.count(Node); auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 8c4523206070..dda1a2f08076 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -588,7 +588,7 @@ struct AllSwitchPaths { PrevBB = BB; } - if (TPath.isExitValueSet()) + if (TPath.isExitValueSet() && isSupported(TPath)) TPaths.push_back(TPath); } } @@ -683,6 +683,62 @@ private: return Res; } + /// The determinator BB should precede the switch-defining BB. + /// + /// Otherwise, it is possible that the state defined in the determinator block + /// defines the state for the next iteration of the loop, rather than for the + /// current one. 
+ /// + /// Currently supported paths: + /// \code + /// < switch bb1 determ def > [ 42, determ ] + /// < switch_and_def bb1 determ > [ 42, determ ] + /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ] + /// \endcode + /// + /// Unsupported paths: + /// \code + /// < switch bb1 def determ > [ 43, determ ] + /// < switch_and_determ bb1 def > [ 43, switch_and_determ ] + /// \endcode + bool isSupported(const ThreadingPath &TPath) { + Instruction *SwitchCondI = dyn_cast<Instruction>(Switch->getCondition()); + assert(SwitchCondI); + if (!SwitchCondI) + return false; + + const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent(); + const BasicBlock *SwitchCondUseBB = Switch->getParent(); + const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB(); + + assert( + SwitchCondUseBB == TPath.getPath().front() && + "The first BB in a threading path should have the switch instruction"); + if (SwitchCondUseBB != TPath.getPath().front()) + return false; + + // Make DeterminatorBB the first element in Path. 
+ PathType Path = TPath.getPath(); + auto ItDet = std::find(Path.begin(), Path.end(), DeterminatorBB); + std::rotate(Path.begin(), ItDet, Path.end()); + + bool IsDetBBSeen = false; + bool IsDefBBSeen = false; + bool IsUseBBSeen = false; + for (BasicBlock *BB : Path) { + if (BB == DeterminatorBB) + IsDetBBSeen = true; + if (BB == SwitchCondDefBB) + IsDefBBSeen = true; + if (BB == SwitchCondUseBB) + IsUseBBSeen = true; + if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen) + return false; + } + + return true; + } + SwitchInst *Switch; BasicBlock *SwitchBlock; OptimizationRemarkEmitter *ORE; diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index e0d3a6accadd..eadbb4293539 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -175,44 +175,6 @@ static cl::opt<bool> using OverlapIntervalsTy = std::map<int64_t, int64_t>; using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>; -/// If the value of this instruction and the memory it writes to is unused, may -/// we delete this instruction? -static bool isRemovable(Instruction *I) { - // Don't remove volatile/atomic stores. - if (StoreInst *SI = dyn_cast<StoreInst>(I)) - return SI->isUnordered(); - - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - default: llvm_unreachable("Does not have LocForWrite"); - case Intrinsic::lifetime_end: - // Never remove dead lifetime_end's, e.g. because it is followed by a - // free. - return false; - case Intrinsic::init_trampoline: - // Always safe to remove init_trampoline. - return true; - case Intrinsic::memset: - case Intrinsic::memmove: - case Intrinsic::memcpy: - case Intrinsic::memcpy_inline: - // Don't remove volatile memory intrinsics. 
- return !cast<MemIntrinsic>(II)->isVolatile(); - case Intrinsic::memcpy_element_unordered_atomic: - case Intrinsic::memmove_element_unordered_atomic: - case Intrinsic::memset_element_unordered_atomic: - case Intrinsic::masked_store: - return true; - } - } - - // note: only get here for calls with analyzable writes - i.e. libcalls - if (auto *CB = dyn_cast<CallBase>(I)) - return CB->use_empty(); - - return false; -} - /// Returns true if the end of this instruction can be safely shortened in /// length. static bool isShortenableAtTheEnd(Instruction *I) { @@ -835,7 +797,7 @@ struct DSEState { auto *MD = dyn_cast_or_null<MemoryDef>(MA); if (MD && MemDefs.size() < MemorySSADefsPerBlockLimit && - (getLocForWriteEx(&I) || isMemTerminatorInst(&I))) + (getLocForWrite(&I) || isMemTerminatorInst(&I))) MemDefs.push_back(MD); } } @@ -1022,48 +984,39 @@ struct DSEState { return I.first->second; } - Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const { + Optional<MemoryLocation> getLocForWrite(Instruction *I) const { if (!I->mayWriteToMemory()) return None; - if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I)) - return {MemoryLocation::getForDest(MTI)}; + if (auto *CB = dyn_cast<CallBase>(I)) + return MemoryLocation::getForDest(CB, TLI); + + return MemoryLocation::getOrNone(I); + } + + /// Assuming this instruction has a dead analyzable write, can we delete + /// this instruction? + bool isRemovable(Instruction *I) { + assert(getLocForWrite(I) && "Must have analyzable write"); + + // Don't remove volatile/atomic stores. + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isUnordered(); if (auto *CB = dyn_cast<CallBase>(I)) { - // If the functions may write to memory we do not know about, bail out. - if (!CB->onlyAccessesArgMemory() && - !CB->onlyAccessesInaccessibleMemOrArgMem()) - return None; + // Don't remove volatile memory intrinsics. 
+ if (auto *MI = dyn_cast<MemIntrinsic>(CB)) + return !MI->isVolatile(); - LibFunc LF; - if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) { - switch (LF) { - case LibFunc_strncpy: - if (const auto *Len = dyn_cast<ConstantInt>(CB->getArgOperand(2))) - return MemoryLocation(CB->getArgOperand(0), - LocationSize::precise(Len->getZExtValue()), - CB->getAAMetadata()); - LLVM_FALLTHROUGH; - case LibFunc_strcpy: - case LibFunc_strcat: - case LibFunc_strncat: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - default: - break; - } - } - switch (CB->getIntrinsicID()) { - case Intrinsic::init_trampoline: - return {MemoryLocation::getAfter(CB->getArgOperand(0))}; - case Intrinsic::masked_store: - return {MemoryLocation::getForArgument(CB, 1, TLI)}; - default: - break; - } - return None; + // Never remove dead lifetime intrinsics, e.g. because they are followed + // by a free. + if (CB->isLifetimeStartOrEnd()) + return false; + + return CB->use_empty() && CB->willReturn() && CB->doesNotThrow(); } - return MemoryLocation::getOrNone(I); + return false; } /// Returns true if \p UseInst completely overwrites \p DefLoc @@ -1081,7 +1034,7 @@ struct DSEState { return false; int64_t InstWriteOffset, DepWriteOffset; - if (auto CC = getLocForWriteEx(UseInst)) + if (auto CC = getLocForWrite(UseInst)) return isOverwrite(UseInst, DefInst, *CC, DefLoc, InstWriteOffset, DepWriteOffset) == OW_Complete; return false; @@ -1093,7 +1046,7 @@ struct DSEState { << *Def->getMemoryInst() << ") is at the end the function \n"); - auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst()); + auto MaybeLoc = getLocForWrite(Def->getMemoryInst()); if (!MaybeLoc) { LLVM_DEBUG(dbgs() << " ... could not get location for write.\n"); return false; @@ -1237,30 +1190,14 @@ struct DSEState { /// loop. In particular, this guarantees that it only references a single /// MemoryLocation during execution of the containing function. 
bool isGuaranteedLoopInvariant(const Value *Ptr) { - auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) { - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (isa<AllocaInst>(Ptr)) - return true; - - if (isAllocLikeFn(I, &TLI)) - return true; - - return false; - } - return true; - }; - Ptr = Ptr->stripPointerCasts(); - if (auto *I = dyn_cast<Instruction>(Ptr)) { - if (I->getParent()->isEntryBlock()) - return true; - } - if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) { - return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) && - GEP->hasAllConstantIndices(); - } - return IsGuaranteedLoopInvariantBase(Ptr); + if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) + if (GEP->hasAllConstantIndices()) + Ptr = GEP->getPointerOperand()->stripPointerCasts(); + + if (auto *I = dyn_cast<Instruction>(Ptr)) + return I->getParent()->isEntryBlock(); + return true; } // Find a MemoryDef writing to \p KillingLoc and dominating \p StartAccess, @@ -1372,7 +1309,7 @@ struct DSEState { // If Current does not have an analyzable write location or is not // removable, skip it. 
- CurrentLoc = getLocForWriteEx(CurrentI); + CurrentLoc = getLocForWrite(CurrentI); if (!CurrentLoc || !isRemovable(CurrentI)) { CanOptimize = false; continue; @@ -1729,14 +1666,13 @@ struct DSEState { LLVM_DEBUG( dbgs() << "Trying to eliminate MemoryDefs at the end of the function\n"); - for (int I = MemDefs.size() - 1; I >= 0; I--) { - MemoryDef *Def = MemDefs[I]; - if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst())) + for (MemoryDef *Def : llvm::reverse(MemDefs)) { + if (SkipStores.contains(Def)) continue; Instruction *DefI = Def->getMemoryInst(); - auto DefLoc = getLocForWriteEx(DefI); - if (!DefLoc) + auto DefLoc = getLocForWrite(DefI); + if (!DefLoc || !isRemovable(DefI)) continue; // NOTE: Currently eliminating writes at the end of a function is limited @@ -1763,13 +1699,19 @@ struct DSEState { /// \returns true if \p Def is a no-op store, either because it /// directly stores back a loaded value or stores zero to a calloced object. bool storeIsNoop(MemoryDef *Def, const Value *DefUO) { - StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst()); - MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst()); + Instruction *DefI = Def->getMemoryInst(); + StoreInst *Store = dyn_cast<StoreInst>(DefI); + MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI); Constant *StoredConstant = nullptr; if (Store) StoredConstant = dyn_cast<Constant>(Store->getOperand(0)); - if (MemSet) + else if (MemSet) StoredConstant = dyn_cast<Constant>(MemSet->getValue()); + else + return false; + + if (!isRemovable(DefI)) + return false; if (StoredConstant && StoredConstant->isNullValue()) { auto *DefUOInst = dyn_cast<Instruction>(DefUO); @@ -1902,7 +1844,7 @@ struct DSEState { bool Changed = false; for (auto OI : IOL) { Instruction *DeadI = OI.first; - MemoryLocation Loc = *getLocForWriteEx(DeadI); + MemoryLocation Loc = *getLocForWrite(DeadI); assert(isRemovable(DeadI) && "Expect only removable instruction"); const Value *Ptr = Loc.Ptr->stripPointerCasts(); @@ 
-1925,9 +1867,14 @@ struct DSEState { LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs that write the " "already existing value\n"); for (auto *Def : MemDefs) { - if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def) || - !isRemovable(Def->getMemoryInst())) + if (SkipStores.contains(Def) || MSSA.isLiveOnEntryDef(Def)) continue; + + Instruction *DefInst = Def->getMemoryInst(); + auto MaybeDefLoc = getLocForWrite(DefInst); + if (!MaybeDefLoc || !isRemovable(DefInst)) + continue; + MemoryDef *UpperDef; // To conserve compile-time, we avoid walking to the next clobbering def. // Instead, we just try to get the optimized access, if it exists. DSE @@ -1939,17 +1886,14 @@ struct DSEState { if (!UpperDef || MSSA.isLiveOnEntryDef(UpperDef)) continue; - Instruction *DefInst = Def->getMemoryInst(); Instruction *UpperInst = UpperDef->getMemoryInst(); - auto IsRedundantStore = [this, DefInst, - UpperInst](MemoryLocation UpperLoc) { + auto IsRedundantStore = [&]() { if (DefInst->isIdenticalTo(UpperInst)) return true; if (auto *MemSetI = dyn_cast<MemSetInst>(UpperInst)) { if (auto *SI = dyn_cast<StoreInst>(DefInst)) { - auto MaybeDefLoc = getLocForWriteEx(DefInst); - if (!MaybeDefLoc) - return false; + // MemSetInst must have a write location. 
+ MemoryLocation UpperLoc = *getLocForWrite(UpperInst); int64_t InstWriteOffset = 0; int64_t DepWriteOffset = 0; auto OR = isOverwrite(UpperInst, DefInst, UpperLoc, *MaybeDefLoc, @@ -1962,9 +1906,7 @@ struct DSEState { return false; }; - auto MaybeUpperLoc = getLocForWriteEx(UpperInst); - if (!MaybeUpperLoc || !IsRedundantStore(*MaybeUpperLoc) || - isReadClobber(*MaybeUpperLoc, DefInst)) + if (!IsRedundantStore() || isReadClobber(*MaybeDefLoc, DefInst)) continue; LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *DefInst << '\n'); @@ -1995,7 +1937,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, MaybeKillingLoc = State.getLocForTerminator(KillingI).map( [](const std::pair<MemoryLocation, bool> &P) { return P.first; }); else - MaybeKillingLoc = State.getLocForWriteEx(KillingI); + MaybeKillingLoc = State.getLocForWrite(KillingI); if (!MaybeKillingLoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -2059,7 +2001,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, if (!DebugCounter::shouldExecute(MemorySSACounter)) continue; - MemoryLocation DeadLoc = *State.getLocForWriteEx(DeadI); + MemoryLocation DeadLoc = *State.getLocForWrite(DeadI); if (IsMemTerm) { const Value *DeadUndObj = getUnderlyingObject(DeadLoc.Ptr); @@ -2124,8 +2066,7 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, } // Check if the store is a no-op. 
- if (!Shortend && isRemovable(KillingI) && - State.storeIsNoop(KillingDef, KillingUndObj)) { + if (!Shortend && State.storeIsNoop(KillingDef, KillingUndObj)) { LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *KillingI << '\n'); State.deleteDeadInstruction(KillingI); diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 90f71f7729a7..a24997dd3fd4 100644 --- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -1366,8 +1366,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (auto *I = dyn_cast<Instruction>(V)) - I->andIRFlags(&Inst); + if (auto *I = dyn_cast<Instruction>(V)) { + // If I being poison triggers UB, there is no need to drop those + // flags. Otherwise, only retain flags present on both I and Inst. + // TODO: Currently some fast-math flags are not treated as + // poison-generating even though they should. Until this is fixed, + // always retain flags present on both I and Inst for floating point + // instructions. 
+ if (isa<FPMathOperator>(I) || (I->hasPoisonGeneratingFlags() && !programUndefinedIfPoison(I))) + I->andIRFlags(&Inst); + } Inst.replaceAllUsesWith(V); salvageKnowledge(&Inst, &AC); removeMSSA(Inst); diff --git a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index e54a270fb276..44017b555769 100644 --- a/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -13,10 +13,12 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/FlattenCFG.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -24,11 +26,11 @@ using namespace llvm; #define DEBUG_TYPE "flattencfg" namespace { -struct FlattenCFGPass : public FunctionPass { +struct FlattenCFGLegacyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid public: - FlattenCFGPass() : FunctionPass(ID) { - initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry()); + FlattenCFGLegacyPass() : FunctionPass(ID) { + initializeFlattenCFGLegacyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; @@ -39,21 +41,10 @@ public: private: AliasAnalysis *AA; }; -} - -char FlattenCFGPass::ID = 0; -INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false, - false) - -// Public interface to the FlattenCFG pass -FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } /// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function, /// iterating until no more changes are made. 
-static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { +bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; @@ -78,8 +69,22 @@ static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { } return Changed; } +} // namespace -bool FlattenCFGPass::runOnFunction(Function &F) { +char FlattenCFGLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(FlattenCFGLegacyPass, "flattencfg", "Flatten the CFG", + false, false) + +// Public interface to the FlattenCFG pass +FunctionPass *llvm::createFlattenCFGPass() { + return new FlattenCFGLegacyPass(); +} + +bool FlattenCFGLegacyPass::runOnFunction(Function &F) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); bool EverChanged = false; // iterativelyFlattenCFG can make some blocks dead. @@ -89,3 +94,15 @@ bool FlattenCFGPass::runOnFunction(Function &F) { } return EverChanged; } + +PreservedAnalyses FlattenCFGPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool EverChanged = false; + AliasAnalysis *AA = &AM.getResult<AAManager>(F); + // iterativelyFlattenCFG can make some blocks dead. + while (iterativelyFlattenCFG(F, AA)) { + removeUnreachableBlocks(F); + EverChanged = true; + } + return EverChanged ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 6f97f3e93123..bc792ca3d8da 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -107,11 +107,6 @@ static cl::opt<bool> ControlFlowHoisting( "licm-control-flow-hoisting", cl::Hidden, cl::init(false), cl::desc("Enable control flow (and PHI) hoisting in LICM")); -static cl::opt<unsigned> HoistSinkColdnessThreshold( - "licm-coldness-threshold", cl::Hidden, cl::init(4), - cl::desc("Relative coldness Threshold of hoisting/sinking destination " - "block for LICM to be considered beneficial")); - static cl::opt<uint32_t> MaxNumUsesTraversed( "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), cl::desc("Max num uses visited for identifying load " @@ -819,35 +814,6 @@ public: }; } // namespace -// Hoisting/sinking instruction out of a loop isn't always beneficial. It's only -// only worthwhile if the destination block is actually colder than current -// block. -static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock, - OptimizationRemarkEmitter *ORE, - BlockFrequencyInfo *BFI) { - // Check block frequency only when runtime profile is available - // to avoid pathological cases. With static profile, lean towards - // hosting because it helps canonicalize the loop for vectorizer. 
- if (!DstBlock->getParent()->hasProfileData()) - return true; - - if (!HoistSinkColdnessThreshold || !BFI) - return true; - - BasicBlock *SrcBlock = I.getParent(); - if (BFI->getBlockFreq(DstBlock).getFrequency() / HoistSinkColdnessThreshold > - BFI->getBlockFreq(SrcBlock).getFrequency()) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "SinkHoistInst", &I) - << "failed to sink or hoist instruction because containing block " - "has lower frequency than destination block"; - }); - return false; - } - - return true; -} - /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -909,7 +875,6 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I, AA, DT, CurLoop, /*CurAST*/ nullptr, MSSAU, true, &Flags, ORE) && - worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { @@ -1741,7 +1706,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, // First check if I is worth sinking for all uses. Sink only when it is worth // across all uses. SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end()); - SmallVector<PHINode *, 8> ExitPNs; for (auto *UI : Users) { auto *User = cast<Instruction>(UI); @@ -1751,14 +1715,6 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, PHINode *PN = cast<PHINode>(User); assert(ExitBlockSet.count(PN->getParent()) && "The LCSSA PHI is not in an exit block!"); - if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) { - return Changed; - } - - ExitPNs.push_back(PN); - } - - for (auto *PN : ExitPNs) { // The PHI must be trivially replaceable. 
Instruction *New = sinkThroughTriviallyReplaceablePHI( diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 77d76609c926..57e36e5b9b90 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -224,8 +224,8 @@ bool LoopDataPrefetch::run() { bool MadeChange = false; for (Loop *I : *LI) - for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) - MadeChange |= runOnLoop(*L); + for (Loop *L : depth_first(I)) + MadeChange |= runOnLoop(L); return MadeChange; } diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 42da86a9ecf5..5d00fa56e888 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -786,9 +786,9 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, Type *IntIdxTy = DL->getIndexType(StorePtr->getType()); const SCEV *StoreSizeSCEV = SE->getConstant(IntIdxTy, StoreSize); if (processLoopStridedStore(StorePtr, StoreSizeSCEV, - MaybeAlign(HeadStore->getAlignment()), - StoredVal, HeadStore, AdjacentStores, StoreEv, - BECount, IsNegStride)) { + MaybeAlign(HeadStore->getAlign()), StoredVal, + HeadStore, AdjacentStores, StoreEv, BECount, + IsNegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); Changed = true; } @@ -967,12 +967,22 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, << "\n"); if (PositiveStrideSCEV != MemsetSizeSCEV) { - // TODO: folding can be done to the SCEVs - // The folding is to fold expressions that is covered by the loop guard - // at loop entry. After the folding, compare again and proceed - // optimization if equal. - LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); - return false; + // If an expression is covered by the loop guard, compare again and + // proceed with optimization if equal. 
+ const SCEV *FoldedPositiveStride = + SE->applyLoopGuards(PositiveStrideSCEV, CurLoop); + const SCEV *FoldedMemsetSize = + SE->applyLoopGuards(MemsetSizeSCEV, CurLoop); + + LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n" + << " FoldedMemsetSize: " << *FoldedMemsetSize << "\n" + << " FoldedPositiveStride: " << *FoldedPositiveStride + << "\n"); + + if (FoldedPositiveStride != FoldedMemsetSize) { + LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n"); + return false; + } } } diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 56d66b93dd69..9d22eceb987f 100644 --- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -1456,16 +1456,12 @@ void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) { } // Remove instructions associated with non-base iterations. - for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend(); - J != JE;) { - unsigned I = Uses[&*J].find_first(); + for (Instruction &Inst : llvm::make_early_inc_range(llvm::reverse(*Header))) { + unsigned I = Uses[&Inst].find_first(); if (I > 0 && I < IL_All) { - LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n"); - J++->eraseFromParent(); - continue; + LLVM_DEBUG(dbgs() << "LRR: removing: " << Inst << "\n"); + Inst.eraseFromParent(); } - - ++J; } // Rewrite each BaseInst using SCEV. diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index a9a2266e1196..798af48c2337 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6011,7 +6011,7 @@ struct SCEVDbgValueBuilder { // See setFinalExpression: prepend our opcodes on the start of any old // expression opcodes. 
assert(!DI.hasArgList()); - llvm::SmallVector<uint64_t, 6> FinalExpr(Expr.begin() + 2, Expr.end()); + llvm::SmallVector<uint64_t, 6> FinalExpr(llvm::drop_begin(Expr, 2)); auto *NewExpr = DIExpression::prependOpcodes(OldExpr, FinalExpr, /*StackValue*/ true); DI.setExpression(NewExpr); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 39c8b65968aa..893928fb0560 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1136,6 +1136,31 @@ static LoopUnrollResult tryToUnrollLoop( TransformationMode TM = hasUnrollTransformation(L); if (TM & TM_Disable) return LoopUnrollResult::Unmodified; + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // parent loop has an explicit unroll-and-jam pragma. This is to prevent + // automatic unrolling from interfering with the user requested + // transformation. + Loop *ParentL = L->getParentLoop(); + if (ParentL != NULL && + hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has" + << " llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + + // If this loop isn't forced to be unrolled, avoid unrolling it when the + // loop has an explicit unroll-and-jam pragma. This is to prevent automatic + // unrolling from interfering with the user requested transformation. 
+ if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser && + hasUnrollTransformation(L) != TM_ForcedByUser) { + LLVM_DEBUG( + dbgs() + << " Not unrolling loop since it has llvm.loop.unroll_and_jam.\n"); + return LoopUnrollResult::Unmodified; + } + if (!L->isLoopSimplifyForm()) { LLVM_DEBUG( dbgs() << " Not unrolling loop which is not in loop-simplify form.\n"); diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp index 91215cd19e2b..10a8742940b1 100644 --- a/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -638,6 +638,7 @@ class NewGVN { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; + mutable DenseMap<const IntrinsicInst *, const Value *> IntrinsicInstPred; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -794,7 +795,7 @@ private: BasicBlock *PHIBlock) const; const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; ExprResult performSymbolicCmpEvaluation(Instruction *) const; - ExprResult performSymbolicPredicateInfoEvaluation(Instruction *) const; + ExprResult performSymbolicPredicateInfoEvaluation(IntrinsicInst *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -815,6 +816,8 @@ private: // Ranking unsigned int getRank(const Value *) const; bool shouldSwapOperands(const Value *, const Value *) const; + bool shouldSwapOperandsForIntrinsic(const Value *, const Value *, + const IntrinsicInst *I) const; // Reachability handling. 
void updateReachableEdge(BasicBlock *, BasicBlock *); @@ -1552,7 +1555,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { } NewGVN::ExprResult -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { +NewGVN::performSymbolicPredicateInfoEvaluation(IntrinsicInst *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return ExprResult::none(); @@ -1572,7 +1575,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { Value *AdditionallyUsedValue = CmpOp0; // Sort the ops. - if (shouldSwapOperands(FirstOp, SecondOp)) { + if (shouldSwapOperandsForIntrinsic(FirstOp, SecondOp, I)) { std::swap(FirstOp, SecondOp); Predicate = CmpInst::getSwappedPredicate(Predicate); AdditionallyUsedValue = CmpOp1; @@ -1598,7 +1601,7 @@ NewGVN::ExprResult NewGVN::performSymbolicCallEvaluation(Instruction *I) const { // Intrinsics with the returned attribute are copies of arguments. if (auto *ReturnedValue = II->getReturnedArgOperand()) { if (II->getIntrinsicID() == Intrinsic::ssa_copy) - if (auto Res = performSymbolicPredicateInfoEvaluation(I)) + if (auto Res = performSymbolicPredicateInfoEvaluation(II)) return Res; return ExprResult::some(createVariableOrConstant(ReturnedValue)); } @@ -2951,6 +2954,7 @@ void NewGVN::cleanupTables() { PredicateToUsers.clear(); MemoryToUsers.clear(); RevisitOnReachabilityChange.clear(); + IntrinsicInstPred.clear(); } // Assign local DFS number mapping to instructions, and leave space for Value @@ -4152,6 +4156,29 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +bool NewGVN::shouldSwapOperandsForIntrinsic(const Value *A, const Value *B, + const IntrinsicInst *I) const { + auto LookupResult = IntrinsicInstPred.find(I); + if (shouldSwapOperands(A, B)) { + if (LookupResult == IntrinsicInstPred.end()) + IntrinsicInstPred.insert({I, B}); + else + LookupResult->second = B; + return true; + } 
+ + if (LookupResult != IntrinsicInstPred.end()) { + auto *SeenPredicate = LookupResult->second; + if (SeenPredicate) { + if (SeenPredicate == B) + return true; + else + LookupResult->second = nullptr; + } + } + return false; +} + namespace { class NewGVNLegacyPass : public FunctionPass { diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 2d3490b2d29e..e12eca0ed287 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1359,16 +1359,6 @@ static constexpr Attribute::AttrKind FnAttrsToStrip[] = Attribute::InaccessibleMemOrArgMemOnly, Attribute::NoSync, Attribute::NoFree}; -// List of all parameter and return attributes which must be stripped when -// lowering from the abstract machine model. Note that we list attributes -// here which aren't valid as return attributes, that is okay. There are -// also some additional attributes with arguments which are handled -// explicitly and are not in this list. -static constexpr Attribute::AttrKind ParamAttrsToStrip[] = - {Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly, - Attribute::NoAlias, Attribute::NoFree}; - - // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. static AttributeList legalizeCallAttributes(LLVMContext &Ctx, @@ -2650,24 +2640,19 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, return !Records.empty(); } -// Handles both return values and arguments for Functions and calls. -template <typename AttrHolder> -static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, - unsigned Index) { +// List of all parameter and return attributes which must be stripped when +// lowering from the abstract machine model. Note that we list attributes +// here which aren't valid as return attributes, that is okay. 
+static AttrBuilder getParamAndReturnAttributesToRemove() { AttrBuilder R; - AttributeSet AS = AH.getAttributes().getAttributes(Index); - if (AS.getDereferenceableBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable, - AS.getDereferenceableBytes())); - if (AS.getDereferenceableOrNullBytes()) - R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, - AS.getDereferenceableOrNullBytes())); - for (auto Attr : ParamAttrsToStrip) - if (AS.hasAttribute(Attr)) - R.addAttribute(Attr); - - if (!R.empty()) - AH.setAttributes(AH.getAttributes().removeAttributesAtIndex(Ctx, Index, R)); + R.addDereferenceableAttr(1); + R.addDereferenceableOrNullAttr(1); + R.addAttribute(Attribute::ReadNone); + R.addAttribute(Attribute::ReadOnly); + R.addAttribute(Attribute::WriteOnly); + R.addAttribute(Attribute::NoAlias); + R.addAttribute(Attribute::NoFree); + return R; } static void stripNonValidAttributesFromPrototype(Function &F) { @@ -2683,13 +2668,13 @@ static void stripNonValidAttributesFromPrototype(Function &F) { return; } + AttrBuilder R = getParamAndReturnAttributesToRemove(); for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveNonValidAttrAtIndex(Ctx, F, - A.getArgNo() + AttributeList::FirstArgIndex); + F.removeParamAttrs(A.getArgNo(), R); if (isa<PointerType>(F.getReturnType())) - RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); + F.removeRetAttrs(R); for (auto Attr : FnAttrsToStrip) F.removeFnAttr(Attr); @@ -2757,13 +2742,13 @@ static void stripNonValidDataFromBody(Function &F) { stripInvalidMetadataFromInstruction(I); + AttrBuilder R = getParamAndReturnAttributesToRemove(); if (auto *Call = dyn_cast<CallBase>(&I)) { for (int i = 0, e = Call->arg_size(); i != e; i++) if (isa<PointerType>(Call->getArgOperand(i)->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, - i + AttributeList::FirstArgIndex); + Call->removeParamAttrs(i, R); if (isa<PointerType>(Call->getType())) - RemoveNonValidAttrAtIndex(Ctx, *Call, 
AttributeList::ReturnIndex); + Call->removeRetAttrs(R); } } diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 28e00c873361..ff2f8a25f379 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -101,8 +101,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) { Constant *Const = nullptr; if (V->getType()->isStructTy()) { std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V); - if (any_of(IVs, - [](const ValueLatticeElement &LV) { return isOverdefined(LV); })) + if (llvm::any_of(IVs, isOverdefined)) return false; std::vector<Constant *> ConstVals; auto *ST = cast<StructType>(V->getType()); diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index a041af0d70d0..f9650efc051f 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -54,7 +54,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeMakeGuardsExplicitLegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); initializeGVNSinkLegacyPassPass(Registry); - initializeFlattenCFGPassPass(Registry); + initializeFlattenCFGLegacyPassPass(Registry); initializeIRCELegacyPassPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); initializeInferAddressSpacesPass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index ffa2f9adb978..d23925042b0a 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -648,13 +648,13 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { Value *Current = V; // ExtInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. 
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { + for (CastInst *I : llvm::reverse(ExtInsts)) { if (Constant *C = dyn_cast<Constant>(Current)) { // If Current is a constant, apply s/zext using ConstantExpr::getCast. // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. - Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); + Current = ConstantExpr::getCast(I->getOpcode(), C, I->getType()); } else { - Instruction *Ext = (*I)->clone(); + Instruction *Ext = I->clone(); Ext->setOperand(0, Current); Ext->insertBefore(IP); Current = Ext; diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp new file mode 100644 index 000000000000..dfb9f608eab2 --- /dev/null +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -0,0 +1,942 @@ +//===- CodeLayout.cpp - Implementation of code layout algorithms ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// ExtTSP - layout of basic blocks with i-cache optimization. +// +// The algorithm tries to find a layout of nodes (basic blocks) of a given CFG +// optimizing jump locality and thus processor I-cache utilization. This is +// achieved via increasing the number of fall-through jumps and co-locating +// frequently executed nodes together. The name follows the underlying +// optimization problem, Extended-TSP, which is a generalization of classical +// (maximum) Traveling Salesmen Problem. +// +// The algorithm is a greedy heuristic that works with chains (ordered lists) +// of basic blocks. Initially all chains are isolated basic blocks. 
On every +// iteration, we pick a pair of chains whose merging yields the biggest increase +// in the ExtTSP score, which models how i-cache "friendly" a specific chain is. +// A pair of chains giving the maximum gain is merged into a new chain. The +// procedure stops when there is only one chain left, or when merging does not +// increase ExtTSP. In the latter case, the remaining chains are sorted by +// density in the decreasing order. +// +// An important aspect is the way two chains are merged. Unlike earlier +// algorithms (e.g., based on the approach of Pettis-Hansen), two +// chains, X and Y, are first split into three, X1, X2, and Y. Then we +// consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y, +// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. +// This improves the quality of the final result (the search space is larger) +// while keeping the implementation sufficiently fast. +// +// Reference: +// * A. Newell and S. Pupyrev, Improved Basic Block Reordering, +// IEEE Transactions on Computers, 2020 +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CodeLayout.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +#define DEBUG_TYPE "code-layout" + +// Algorithm-specific constants. The values are tuned for the best performance +// of large-scale front-end bound binaries. 
+static cl::opt<double>
+    ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1),
+                  cl::desc("The weight of forward jumps for ExtTSP value"));
+
+static cl::opt<double>
+    BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1),
+                   cl::desc("The weight of backward jumps for ExtTSP value"));
+
+static cl::opt<unsigned> ForwardDistance(
+    "ext-tsp-forward-distance", cl::Hidden, cl::init(1024),
+    cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
+
+static cl::opt<unsigned> BackwardDistance(
+    "ext-tsp-backward-distance", cl::Hidden, cl::init(640),
+    cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
+
+// The maximum size of a chain for splitting. Larger values of the threshold
+// may yield better quality at the cost of worse run-time.
+static cl::opt<unsigned> ChainSplitThreshold(
+    "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128),
+    cl::desc("The maximum size of a chain to apply splitting"));
+
+// The option enables splitting (large) chains along in-coming and out-going
+// jumps. This typically results in a better quality.
+static cl::opt<bool> EnableChainSplitAlongJumps(
+    "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true),
+    cl::desc("The maximum size of a chain to apply splitting"));
+
+namespace {
+
+// Epsilon for comparison of doubles.
+constexpr double EPS = 1e-8;
+
+// Compute the Ext-TSP score for a jump between a given pair of blocks,
+// using their sizes, (estimated) addresses and the jump execution count.
+double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
+                   uint64_t Count) {
+  // Fallthrough
+  if (SrcAddr + SrcSize == DstAddr) {
+    // Assume that FallthroughWeight = 1.0 after normalization
+    return static_cast<double>(Count);
+  }
+  // Forward
+  if (SrcAddr + SrcSize < DstAddr) {
+    const auto Dist = DstAddr - (SrcAddr + SrcSize);
+    if (Dist <= ForwardDistance) {
+      double Prob = 1.0 - static_cast<double>(Dist) / ForwardDistance;
+      return ForwardWeight * Prob * Count;
+    }
+    return 0;
+  }
+  // Backward
+  const auto Dist = SrcAddr + SrcSize - DstAddr;
+  if (Dist <= BackwardDistance) {
+    double Prob = 1.0 - static_cast<double>(Dist) / BackwardDistance;
+    return BackwardWeight * Prob * Count;
+  }
+  return 0;
+}
+
+/// A type of merging two chains, X and Y. The former chain is split into
+/// X1 and X2 and then concatenated with Y in the order specified by the type.
+enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
+
+/// The gain of merging two chains, that is, the Ext-TSP score of the merge
+/// together with the corresponding merge 'type' and 'offset'.
+class MergeGainTy {
+public:
+  explicit MergeGainTy() {}
+  explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
+      : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
+
+  double score() const { return Score; }
+
+  size_t mergeOffset() const { return MergeOffset; }
+
+  MergeTypeTy mergeType() const { return MergeType; }
+
+  // Returns 'true' iff Other is preferred over this.
+  bool operator<(const MergeGainTy &Other) const {
+    return (Other.Score > EPS && Other.Score > Score + EPS);
+  }
+
+  // Update the current gain if Other is preferred over this.
+ void updateIfLessThan(const MergeGainTy &Other) { + if (*this < Other) + *this = Other; + } + +private: + double Score{-1.0}; + size_t MergeOffset{0}; + MergeTypeTy MergeType{MergeTypeTy::X_Y}; +}; + +class Block; +class Jump; +class Chain; +class ChainEdge; + +/// A node in the graph, typically corresponding to a basic block in CFG. +class Block { +public: + Block(const Block &) = delete; + Block(Block &&) = default; + Block &operator=(const Block &) = delete; + Block &operator=(Block &&) = default; + + // The original index of the block in CFG. + size_t Index{0}; + // The index of the block in the current chain. + size_t CurIndex{0}; + // Size of the block in the binary. + uint64_t Size{0}; + // Execution count of the block in the profile data. + uint64_t ExecutionCount{0}; + // Current chain of the node. + Chain *CurChain{nullptr}; + // An offset of the block in the current chain. + mutable uint64_t EstimatedAddr{0}; + // Forced successor of the block in CFG. + Block *ForcedSucc{nullptr}; + // Forced predecessor of the block in CFG. + Block *ForcedPred{nullptr}; + // Outgoing jumps from the block. + std::vector<Jump *> OutJumps; + // Incoming jumps to the block. + std::vector<Jump *> InJumps; + +public: + explicit Block(size_t Index, uint64_t Size_, uint64_t EC) + : Index(Index), Size(Size_), ExecutionCount(EC) {} + bool isEntry() const { return Index == 0; } +}; + +/// An arc in the graph, typically corresponding to a jump between two blocks. +class Jump { +public: + Jump(const Jump &) = delete; + Jump(Jump &&) = default; + Jump &operator=(const Jump &) = delete; + Jump &operator=(Jump &&) = default; + + // Source block of the jump. + Block *Source; + // Target block of the jump. + Block *Target; + // Execution count of the arc in the profile data. 
+ uint64_t ExecutionCount{0}; + +public: + explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} +}; + +/// A chain (ordered sequence) of blocks. +class Chain { +public: + Chain(const Chain &) = delete; + Chain(Chain &&) = default; + Chain &operator=(const Chain &) = delete; + Chain &operator=(Chain &&) = default; + + explicit Chain(uint64_t Id, Block *Block) + : Id(Id), Score(0), Blocks(1, Block) {} + + uint64_t id() const { return Id; } + + bool isEntry() const { return Blocks[0]->Index == 0; } + + double score() const { return Score; } + + void setScore(double NewScore) { Score = NewScore; } + + const std::vector<Block *> &blocks() const { return Blocks; } + + const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const { + return Edges; + } + + ChainEdge *getEdge(Chain *Other) const { + for (auto It : Edges) { + if (It.first == Other) + return It.second; + } + return nullptr; + } + + void removeEdge(Chain *Other) { + auto It = Edges.begin(); + while (It != Edges.end()) { + if (It->first == Other) { + Edges.erase(It); + return; + } + It++; + } + } + + void addEdge(Chain *Other, ChainEdge *Edge) { + Edges.push_back(std::make_pair(Other, Edge)); + } + + void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) { + Blocks = MergedBlocks; + // Update the block's chains + for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { + Blocks[Idx]->CurChain = this; + Blocks[Idx]->CurIndex = Idx; + } + } + + void mergeEdges(Chain *Other); + + void clear() { + Blocks.clear(); + Blocks.shrink_to_fit(); + Edges.clear(); + Edges.shrink_to_fit(); + } + +private: + // Unique chain identifier. + uint64_t Id; + // Cached ext-tsp score for the chain. + double Score; + // Blocks of the chain. + std::vector<Block *> Blocks; + // Adjacent chains and corresponding edges (lists of jumps). 
+  std::vector<std::pair<Chain *, ChainEdge *>> Edges;
+};
+
+/// An edge in CFG representing jumps between two chains.
+/// When blocks are merged into chains, the edges are combined too so that
+/// there is always at most one edge between a pair of chains
+class ChainEdge {
+public:
+  ChainEdge(const ChainEdge &) = delete;
+  ChainEdge(ChainEdge &&) = default;
+  ChainEdge &operator=(const ChainEdge &) = delete;
+  ChainEdge &operator=(ChainEdge &&) = default;
+
+  explicit ChainEdge(Jump *Jump)
+      : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
+        Jumps(1, Jump) {}
+
+  const std::vector<Jump *> &jumps() const { return Jumps; }
+
+  void changeEndpoint(Chain *From, Chain *To) {
+    if (From == SrcChain)
+      SrcChain = To;
+    if (From == DstChain)
+      DstChain = To;
+  }
+
+  void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
+
+  void moveJumps(ChainEdge *Other) {
+    Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
+    Other->Jumps.clear();
+    Other->Jumps.shrink_to_fit();
+  }
+
+  bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
+    return Src == SrcChain ? CacheValidForward : CacheValidBackward;
+  }
+
+  MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
+    return Src == SrcChain ? CachedGainForward : CachedGainBackward;
+  }
+
+  void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
+    if (Src == SrcChain) {
+      CachedGainForward = MergeGain;
+      CacheValidForward = true;
+    } else {
+      CachedGainBackward = MergeGain;
+      CacheValidBackward = true;
+    }
+  }
+
+  void invalidateCache() {
+    CacheValidForward = false;
+    CacheValidBackward = false;
+  }
+
+private:
+  // Source chain.
+  Chain *SrcChain{nullptr};
+  // Destination chain.
+  Chain *DstChain{nullptr};
+  // Original jumps in the binary with corresponding execution counts.
+  std::vector<Jump *> Jumps;
+  // Cached ext-tsp value for merging the pair of chains.
+ // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, + // we store both values here. + MergeGainTy CachedGainForward; + MergeGainTy CachedGainBackward; + // Whether the cached value must be recomputed. + bool CacheValidForward{false}; + bool CacheValidBackward{false}; +}; + +void Chain::mergeEdges(Chain *Other) { + assert(this != Other && "cannot merge a chain with itself"); + + // Update edges adjacent to chain Other + for (auto EdgeIt : Other->Edges) { + const auto DstChain = EdgeIt.first; + const auto DstEdge = EdgeIt.second; + const auto TargetChain = DstChain == Other ? this : DstChain; + auto CurEdge = getEdge(TargetChain); + if (CurEdge == nullptr) { + DstEdge->changeEndpoint(Other, this); + this->addEdge(TargetChain, DstEdge); + if (DstChain != this && DstChain != Other) { + DstChain->addEdge(this, DstEdge); + } + } else { + CurEdge->moveJumps(DstEdge); + } + // Cleanup leftover edge + if (DstChain != Other) { + DstChain->removeEdge(Other); + } + } +} + +using BlockIter = std::vector<Block *>::const_iterator; + +/// A wrapper around three chains of blocks; it is used to avoid extra +/// instantiation of the vectors. 
+class MergedChain { +public: + MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), + BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), + BlockIter End3 = BlockIter()) + : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), + End3(End3) {} + + template <typename F> void forEach(const F &Func) const { + for (auto It = Begin1; It != End1; It++) + Func(*It); + for (auto It = Begin2; It != End2; It++) + Func(*It); + for (auto It = Begin3; It != End3; It++) + Func(*It); + } + + std::vector<Block *> getBlocks() const { + std::vector<Block *> Result; + Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + + std::distance(Begin3, End3)); + Result.insert(Result.end(), Begin1, End1); + Result.insert(Result.end(), Begin2, End2); + Result.insert(Result.end(), Begin3, End3); + return Result; + } + + const Block *getFirstBlock() const { return *Begin1; } + +private: + BlockIter Begin1; + BlockIter End1; + BlockIter Begin2; + BlockIter End2; + BlockIter Begin3; + BlockIter End3; +}; + +/// The implementation of the ExtTSP algorithm. +class ExtTSPImpl { + using EdgeT = std::pair<uint64_t, uint64_t>; + using EdgeCountMap = DenseMap<EdgeT, uint64_t>; + +public: + ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) + : NumNodes(NumNodes) { + initialize(NodeSizes, NodeCounts, EdgeCounts); + } + + /// Run the algorithm and return an optimized ordering of blocks. + void run(std::vector<uint64_t> &Result) { + // Pass 1: Merge blocks with their mutually forced successors + mergeForcedPairs(); + + // Pass 2: Merge pairs of chains while improving the ExtTSP objective + mergeChainPairs(); + + // Pass 3: Merge cold blocks to reduce code size + mergeColdChains(); + + // Collect blocks from all chains + concatChains(Result); + } + +private: + /// Initialize the algorithm's data structures. 
+ void initialize(const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const EdgeCountMap &EdgeCounts) { + // Initialize blocks + AllBlocks.reserve(NumNodes); + for (uint64_t Node = 0; Node < NumNodes; Node++) { + uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL); + uint64_t ExecutionCount = NodeCounts[Node]; + // The execution count of the entry block is set to at least 1 + if (Node == 0 && ExecutionCount == 0) + ExecutionCount = 1; + AllBlocks.emplace_back(Node, Size, ExecutionCount); + } + + // Initialize jumps between blocks + SuccNodes = std::vector<std::vector<uint64_t>>(NumNodes); + PredNodes = std::vector<std::vector<uint64_t>>(NumNodes); + AllJumps.reserve(EdgeCounts.size()); + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + // Ignore self-edges + if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + auto ExecutionCount = It.second; + if (ExecutionCount > 0) { + auto &Block = AllBlocks[Pred]; + auto &SuccBlock = AllBlocks[Succ]; + AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); + SuccBlock.InJumps.push_back(&AllJumps.back()); + Block.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (auto &Block : AllBlocks) { + AllChains.emplace_back(Block.Index, &Block); + Block.CurChain = &AllChains.back(); + if (Block.ExecutionCount > 0) { + HotChains.push_back(&AllChains.back()); + } + } + + // Initialize chain edges + AllEdges.reserve(AllJumps.size()); + for (auto &Block : AllBlocks) { + for (auto &Jump : Block.OutJumps) { + const auto SuccBlock = Jump->Target; + auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); + // this edge is already present in the graph + if (CurEdge != nullptr) { + assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // this is a new edge + 
AllEdges.emplace_back(Jump); + Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); + SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + } + } + } + + /// For a pair of blocks, A and B, block B is the forced successor of A, + /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps + /// to B are from A. Such blocks should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of blocks. + void mergeForcedPairs() { + // Find fallthroughs based on edge weights + for (auto &Block : AllBlocks) { + if (SuccNodes[Block.Index].size() == 1 && + PredNodes[SuccNodes[Block.Index][0]].size() == 1 && + SuccNodes[Block.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Block.Index][0]; + Block.ForcedSucc = &AllBlocks[SuccIndex]; + AllBlocks[SuccIndex].ForcedPred = &Block; + } + } + + // There might be 'cycles' in the forced dependencies, since profile + // data isn't 100% accurate. Typically this is observed in loops, when the + // loop edges are the hottest successors for the basic blocks of the loop. + // Break the cycles by choosing the block with the smallest index as the + // head. This helps to keep the original order of the loops, which likely + // have already been rotated in the optimized manner. 
+ for (auto &Block : AllBlocks) { + if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + continue; + + auto SuccBlock = Block.ForcedSucc; + while (SuccBlock != nullptr && SuccBlock != &Block) { + SuccBlock = SuccBlock->ForcedSucc; + } + if (SuccBlock == nullptr) + continue; + // Break the cycle + AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; + Block.ForcedPred = nullptr; + } + + // Merge blocks with their fallthrough successors + for (auto &Block : AllBlocks) { + if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { + auto CurBlock = &Block; + while (CurBlock->ForcedSucc != nullptr) { + const auto NextBlock = CurBlock->ForcedSucc; + mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + CurBlock = NextBlock; + } + } + } + } + + /// Merge pairs of chains while improving the ExtTSP objective. + void mergeChainPairs() { + /// Deterministically compare pairs of chains + auto compareChainPairs = [](const Chain *A1, const Chain *B1, + const Chain *A2, const Chain *B2) { + if (A1 != A2) + return A1->id() < A2->id(); + return B1->id() < B2->id(); + }; + + while (HotChains.size() > 1) { + Chain *BestChainPred = nullptr; + Chain *BestChainSucc = nullptr; + auto BestGain = MergeGainTy(); + // Iterate over all pairs of chains + for (auto ChainPred : HotChains) { + // Get candidates for merging with the current chain + for (auto EdgeIter : ChainPred->edges()) { + auto ChainSucc = EdgeIter.first; + auto ChainEdge = EdgeIter.second; + // Ignore loop edges + if (ChainPred == ChainSucc) + continue; + + // Compute the gain of merging the two chains + auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge); + if (CurGain.score() <= EPS) + continue; + + if (BestGain < CurGain || + (std::abs(CurGain.score() - BestGain.score()) < EPS && + compareChainPairs(ChainPred, ChainSucc, BestChainPred, + BestChainSucc))) { + BestGain = CurGain; + BestChainPred = ChainPred; + BestChainSucc = ChainSucc; + } + } + } + + // Stop merging 
when there is no improvement + if (BestGain.score() <= EPS) + break; + + // Merge the best pair of chains + mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), + BestGain.mergeType()); + } + } + + /// Merge cold blocks to reduce code size. + void mergeColdChains() { + for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { + // Iterating over neighbors in the reverse order to make sure original + // fallthrough jumps are merged first + size_t NumSuccs = SuccNodes[SrcBB].size(); + for (size_t Idx = 0; Idx < NumSuccs; Idx++) { + auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; + auto SrcChain = AllBlocks[SrcBB].CurChain; + auto DstChain = AllBlocks[DstBB].CurChain; + if (SrcChain != DstChain && !DstChain->isEntry() && + SrcChain->blocks().back()->Index == SrcBB && + DstChain->blocks().front()->Index == DstBB) { + mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); + } + } + } + } + + /// Compute the Ext-TSP score for a given block order and a list of jumps. + double extTSPScore(const MergedChain &MergedBlocks, + const std::vector<Jump *> &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const Block *BB) { + BB->EstimatedAddr = CurAddr; + CurAddr += BB->Size; + }); + + double Score = 0; + for (auto &Jump : Jumps) { + const auto SrcBlock = Jump->Source; + const auto DstBlock = Jump->Target; + Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, + DstBlock->EstimatedAddr, Jump->ExecutionCount); + } + return Score; + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. 
+ MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc, + ChainEdge *Edge) const { + if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) { + return Edge->getCachedMergeGain(ChainPred, ChainSucc); + } + + // Precompute jumps between ChainPred and ChainSucc + auto Jumps = Edge->jumps(); + auto EdgePP = ChainPred->getEdge(ChainPred); + if (EdgePP != nullptr) { + Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end()); + } + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + + // The object holds the best currently chosen gain of merging the two chains + MergeGainTy Gain = MergeGainTy(); + + /// Given a merge offset and a list of merge types, try to merge two chains + /// and update Gain with a better alternative + auto tryChainMerging = [&](size_t Offset, + const std::vector<MergeTypeTy> &MergeTypes) { + // Skip merging corresponding to concatenation w/o splitting + if (Offset == 0 || Offset == ChainPred->blocks().size()) + return; + // Skip merging if it breaks Forced successors + auto BB = ChainPred->blocks()[Offset - 1]; + if (BB->ForcedSucc != nullptr) + return; + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial + for (auto &MergeType : MergeTypes) { + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); + } + }; + + // Try to concatenate two chains w/o splitting + Gain.updateIfLessThan( + computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y)); + + if (EnableChainSplitAlongJumps) { + // Attach (a part of) ChainPred before the first block of ChainSucc + for (auto &Jump : ChainSucc->blocks().front()->InJumps) { + const auto SrcBlock = Jump->Source; + if (SrcBlock->CurChain != ChainPred) + continue; + size_t Offset = SrcBlock->CurIndex + 1; + tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y}); + } + + // Attach (a part of) ChainPred after the last block of ChainSucc + for (auto &Jump : 
ChainSucc->blocks().back()->OutJumps) {
+        const auto DstBlock = Jump->Source;
+        if (DstBlock->CurChain != ChainPred)
+          continue;
+        size_t Offset = DstBlock->CurIndex;
+        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
+      }
+    }
+
+    // Try to break ChainPred in various ways and concatenate with ChainSucc
+    if (ChainPred->blocks().size() <= ChainSplitThreshold) {
+      for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
+        // Try to split the chain in different ways. In practice, applying
+        // X2_Y_X1 merging almost never provides benefits; thus, we exclude
+        // it from consideration to reduce the search space
+        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
+                                 MergeTypeTy::X2_X1_Y});
+      }
+    }
+    Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
+    return Gain;
+  }
+
+  /// Compute the score gain of merging two chains, respecting a given
+  /// merge 'type' and 'offset'.
+  ///
+  /// The two chains are not modified in the method.
+  MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
+                               const std::vector<Jump *> &Jumps,
+                               size_t MergeOffset,
+                               MergeTypeTy MergeType) const {
+    auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
+                                    MergeOffset, MergeType);
+
+    // Do not allow a merge that does not preserve the original entry block
+    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+        !MergedBlocks.getFirstBlock()->isEntry())
+      return MergeGainTy();
+
+    // The gain for the new chain
+    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
+    return MergeGainTy(NewGainScore, MergeOffset, MergeType);
+  }
+
+  /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
+  ///
+  /// If MergeType == 0, then the result is a concatenation of two chains.
+  /// Otherwise, the first chain is cut into two sub-chains at the offset,
+  /// and merged using all possible ways of concatenating three chains.
+ MergedChain mergeBlocks(const std::vector<Block *> &X, + const std::vector<Block *> &Y, size_t MergeOffset, + MergeTypeTy MergeType) const { + // Split the first chain, X, into X1 and X2 + BlockIter BeginX1 = X.begin(); + BlockIter EndX1 = X.begin() + MergeOffset; + BlockIter BeginX2 = X.begin() + MergeOffset; + BlockIter EndX2 = X.end(); + BlockIter BeginY = Y.begin(); + BlockIter EndY = Y.end(); + + // Construct a new chain from the three existing ones + switch (MergeType) { + case MergeTypeTy::X_Y: + return MergedChain(BeginX1, EndX2, BeginY, EndY); + case MergeTypeTy::X1_Y_X2: + return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case MergeTypeTy::Y_X2_X1: + return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case MergeTypeTy::X2_X1_Y: + return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + } + llvm_unreachable("unexpected chain merge type"); + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. 
+  void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
+                   MergeTypeTy MergeType) {
+    assert(Into != From && "a chain cannot be merged with itself");
+
+    // Merge the blocks
+    auto MergedBlocks =
+        mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
+    Into->merge(From, MergedBlocks.getBlocks());
+    Into->mergeEdges(From);
+    From->clear();
+
+    // Update cached ext-tsp score for the new chain
+    auto SelfEdge = Into->getEdge(Into);
+    if (SelfEdge != nullptr) {
+      MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
+      Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+    }
+
+    // Remove chain From from the list of active chains
+    auto Iter = std::remove(HotChains.begin(), HotChains.end(), From);
+    HotChains.erase(Iter, HotChains.end());
+
+    // Invalidate caches
+    for (auto EdgeIter : Into->edges()) {
+      EdgeIter.second->invalidateCache();
+    }
+  }
+
+  /// Concatenate all chains into a final order of blocks.
+  void concatChains(std::vector<uint64_t> &Order) {
+    // Collect chains and calculate some stats for their sorting
+    std::vector<Chain *> SortedChains;
+    DenseMap<const Chain *, double> ChainDensity;
+    for (auto &Chain : AllChains) {
+      if (!Chain.blocks().empty()) {
+        SortedChains.push_back(&Chain);
+        // Using doubles to avoid overflow of ExecutionCount
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (auto Block : Chain.blocks()) {
+          Size += static_cast<double>(Block->Size);
+          ExecutionCount += static_cast<double>(Block->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sorting chains by density in the decreasing order
+    std::stable_sort(SortedChains.begin(), SortedChains.end(),
+                     [&](const Chain *C1, const Chain *C2) {
+                       // Make sure the original entry block is at the
+                       // beginning of the order
+                       if (C1->isEntry() != C2->isEntry()) {
+                         return C1->isEntry();
+                       }
+
+                       const double D1 = ChainDensity[C1];
+                       const double D2 = 
ChainDensity[C2]; + // Compare by density and break ties by chain identifiers + return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id()); + }); + + // Collect the blocks in the order specified by their chains + Order.reserve(NumNodes); + for (auto Chain : SortedChains) { + for (auto Block : Chain->blocks()) { + Order.push_back(Block->Index); + } + } + } + +private: + /// The number of nodes in the graph. + const size_t NumNodes; + + /// Successors of each node. + std::vector<std::vector<uint64_t>> SuccNodes; + + /// Predecessors of each node. + std::vector<std::vector<uint64_t>> PredNodes; + + /// All basic blocks. + std::vector<Block> AllBlocks; + + /// All jumps between blocks. + std::vector<Jump> AllJumps; + + /// All chains of basic blocks. + std::vector<Chain> AllChains; + + /// All edges between chains. + std::vector<ChainEdge> AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector<Chain *> HotChains; +}; + +} // end of anonymous namespace + +std::vector<uint64_t> llvm::applyExtTspLayout( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + size_t NumNodes = NodeSizes.size(); + + // Verify correctness of the input data. + assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); + assert(NumNodes > 2 && "Incorrect input"); + + // Apply the reordering algorithm. + auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts); + std::vector<uint64_t> Result; + Alg.run(Result); + + // Verify correctness of the output. 
+ assert(Result.front() == 0 && "Original entry point is not preserved"); + assert(Result.size() == NumNodes && "Incorrect size of reordered layout"); + return Result; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + // Estimate addresses of the blocks in memory + auto Addr = std::vector<uint64_t>(NodeSizes.size(), 0); + for (size_t Idx = 1; Idx < Order.size(); Idx++) { + Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; + } + + // Increase the score for each jump + double Score = 0; + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + auto Succ = It.first.second; + uint64_t Count = It.second; + Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count); + } + return Score; +} + +double llvm::calcExtTspScore( + const std::vector<uint64_t> &NodeSizes, + const std::vector<uint64_t> &NodeCounts, + const DenseMap<std::pair<uint64_t, uint64_t>, uint64_t> &EdgeCounts) { + auto Order = std::vector<uint64_t>(NodeSizes.size()); + for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { + Order[Idx] = Idx; + } + return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); +} diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index fc7083b0c30d..589622d69578 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -596,7 +596,7 @@ bool llvm::checkDebugInfoMetadata(Module &M, auto DILocsBefore = DIPreservationMap[NameOfWrappedPass].DILocations; auto DILocsAfter = DIPreservationAfter[NameOfWrappedPass].DILocations; - auto InstToDelete = DIPreservationAfter[NameOfWrappedPass].InstToDelete; + auto InstToDelete = DIPreservationMap[NameOfWrappedPass].InstToDelete; auto DIVarsBefore = DIPreservationMap[NameOfWrappedPass].DIVariables; auto DIVarsAfter = 
DIPreservationAfter[NameOfWrappedPass].DIVariables; diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp index 326864803d7c..06596f7b04e1 100644 --- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp +++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp @@ -58,6 +58,14 @@ int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const { return 0; } +int FunctionComparator::cmpAligns(Align L, Align R) const { + if (L.value() < R.value()) + return -1; + if (L.value() > R.value()) + return 1; + return 0; +} + int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const { if ((int)L < (int)R) return -1; @@ -556,13 +564,12 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpTypes(AI->getAllocatedType(), cast<AllocaInst>(R)->getAllocatedType())) return Res; - return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment()); + return cmpAligns(AI->getAlign(), cast<AllocaInst>(R)->getAlign()); } if (const LoadInst *LI = dyn_cast<LoadInst>(L)) { if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile())) return Res; - if (int Res = - cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment())) + if (int Res = cmpAligns(LI->getAlign(), cast<LoadInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering())) @@ -578,8 +585,7 @@ int FunctionComparator::cmpOperations(const Instruction *L, if (int Res = cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile())) return Res; - if (int Res = - cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment())) + if (int Res = cmpAligns(SI->getAlign(), cast<StoreInst>(R)->getAlign())) return Res; if (int Res = cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering())) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ec926b1f5a94..ecad79b68185 100644 --- 
a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -402,6 +402,18 @@ bool llvm::isInstructionTriviallyDead(Instruction *I, return wouldInstructionBeTriviallyDead(I, TLI); } +bool llvm::wouldInstructionBeTriviallyDeadOnUnusedPaths( + Instruction *I, const TargetLibraryInfo *TLI) { + // Instructions that are "markers" and have implied meaning on code around + // them (without explicit uses), are not dead on unused paths. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + if (II->getIntrinsicID() == Intrinsic::stacksave || + II->getIntrinsicID() == Intrinsic::launder_invariant_group || + II->isLifetimeStartOrEnd()) + return false; + return wouldInstructionBeTriviallyDead(I, TLI); +} + bool llvm::wouldInstructionBeTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI) { if (I->isTerminator()) diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index f3cf42be8ba1..69fd110dc3c2 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -104,9 +104,7 @@ bool llvm::canPeel(Loop *L) { // note that LoopPeeling currently can only update the branch weights of latch // blocks and branch weights to blocks with deopt or unreachable do not need // updating. - return all_of(Exits, [](const BasicBlock *BB) { - return IsBlockFollowedByDeoptOrUnreachable(BB); - }); + return llvm::all_of(Exits, IsBlockFollowedByDeoptOrUnreachable); } // This function calculates the number of iterations after which the given Phi @@ -333,6 +331,31 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, return DesiredPeelCount; } +/// This "heuristic" exactly matches implicit behavior which used to exist +/// inside getLoopEstimatedTripCount. It was added here to keep an +/// improvement inside that API from causing peeling to become more agressive. +/// This should probably be removed. 
+static bool violatesLegacyMultiExitLoopCheck(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch) + return true; + + BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator()); + if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch)) + return true; + + assert((LatchBR->getSuccessor(0) == L->getHeader() || + LatchBR->getSuccessor(1) == L->getHeader()) && + "At least one edge out of the latch must go to the header"); + + SmallVector<BasicBlock *, 4> ExitBlocks; + L->getUniqueNonLatchExitBlocks(ExitBlocks); + return any_of(ExitBlocks, [](const BasicBlock *EB) { + return !EB->getTerminatingDeoptimizeCall(); + }); +} + + // Return the number of iterations we want to peel off. void llvm::computePeelCount(Loop *L, unsigned LoopSize, TargetTransformInfo::PeelingPreferences &PP, @@ -436,6 +459,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize, // We only do this in the presence of profile information, since otherwise // our estimates of the trip count are not reliable enough. if (L->getHeader()->getParent()->hasProfileData()) { + if (violatesLegacyMultiExitLoopCheck(L)) + return; Optional<unsigned> PeelCount = getLoopEstimatedTripCount(L); if (!PeelCount) return; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index c8e42acdffb3..93157bd87c34 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -773,8 +773,8 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE, } -/// Checks if \p L has single exit through latch block except possibly -/// "deoptimizing" exits. Returns branch instruction terminating the loop +/// Checks if \p L has an exiting latch branch. There may also be other +/// exiting blocks. Returns branch instruction terminating the loop /// latch if above check is successful, nullptr otherwise. 
static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); @@ -789,53 +789,61 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) { LatchBR->getSuccessor(1) == L->getHeader()) && "At least one edge out of the latch must go to the header"); - SmallVector<BasicBlock *, 4> ExitBlocks; - L->getUniqueNonLatchExitBlocks(ExitBlocks); - if (any_of(ExitBlocks, [](const BasicBlock *EB) { - return !EB->getTerminatingDeoptimizeCall(); - })) - return nullptr; - return LatchBR; } -Optional<unsigned> -llvm::getLoopEstimatedTripCount(Loop *L, - unsigned *EstimatedLoopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. - BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); - if (!LatchBranch) - return None; - +/// Return the estimated trip count for any exiting branch which dominates +/// the loop latch. +static Optional<uint64_t> +getEstimatedTripCount(BranchInst *ExitingBranch, Loop *L, + uint64_t &OrigExitWeight) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. 
- uint64_t BackedgeTakenWeight, LatchExitWeight; - if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) + uint64_t LoopWeight, ExitWeight; + if (!ExitingBranch->extractProfMetadata(LoopWeight, ExitWeight)) return None; - if (LatchBranch->getSuccessor(0) != L->getHeader()) - std::swap(BackedgeTakenWeight, LatchExitWeight); + if (L->contains(ExitingBranch->getSuccessor(1))) + std::swap(LoopWeight, ExitWeight); - if (!LatchExitWeight) + if (!ExitWeight) + // Don't have a way to return predicated infinite return None; - if (EstimatedLoopInvocationWeight) - *EstimatedLoopInvocationWeight = LatchExitWeight; + OrigExitWeight = ExitWeight; - // Estimated backedge taken count is a ratio of the backedge taken weight by - // the weight of the edge exiting the loop, rounded to nearest. - uint64_t BackedgeTakenCount = - llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); - // Estimated trip count is one plus estimated backedge taken count. - return BackedgeTakenCount + 1; + // Estimated exit count is a ratio of the loop weight by the weight of the + // edge exiting the loop, rounded to nearest. + uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight); + // Estimated trip count is one plus estimated exit count. + return ExitCount + 1; +} + +Optional<unsigned> +llvm::getLoopEstimatedTripCount(Loop *L, + unsigned *EstimatedLoopInvocationWeight) { + // Currently we take the estimate exit count only from the loop latch, + // ignoring other exiting blocks. This can overestimate the trip count + // if we exit through another exit, but can never underestimate it. 
+ // TODO: incorporate information from other exits + if (BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L)) { + uint64_t ExitWeight; + if (Optional<uint64_t> EstTripCount = + getEstimatedTripCount(LatchBranch, L, ExitWeight)) { + if (EstimatedLoopInvocationWeight) + *EstimatedLoopInvocationWeight = ExitWeight; + return *EstTripCount; + } + } + return None; } bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount, unsigned EstimatedloopInvocationWeight) { - // Support loops with an exiting latch and other existing exists only - // deoptimize. + // At the moment, we currently support changing the estimate trip count of + // the latch branch only. We could extend this API to manipulate estimated + // trip counts for any exit. BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L); if (!LatchBranch) return false; @@ -923,8 +931,7 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, // Helper to generate an ordered reduction. Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // Extract and apply reduction ops in ascending order: @@ -942,9 +949,6 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, "Invalid min/max"); Result = createMinMaxOp(Builder, RdxKind, Result, Ext); } - - if (!RedOps.empty()) - propagateIRFlags(Result, RedOps); } return Result; @@ -952,14 +956,20 @@ Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src, // Helper to generate a log2 shuffle reduction. 
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, - unsigned Op, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + unsigned Op, RecurKind RdxKind) { unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each // round. assert(isPowerOf2_32(VF) && "Reduction emission only supported for pow2 vectors!"); + // Note: fast-math-flags flags are controlled by the builder configuration + // and are assumed to apply to all generated arithmetic instructions. Other + // poison generating flags (nsw/nuw/inbounds/inrange/exact) are not part + // of the builder configuration, and since they're not passed explicitly, + // will never be relevant here. Note that it would be generally unsound to + // propagate these from an intrinsic call to the expansion anyways as we/ + // change the order of operations. Value *TmpVec = Src; SmallVector<int, 32> ShuffleMask(VF); for (unsigned i = VF; i != 1; i >>= 1) { @@ -973,7 +983,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf"); if (Op != Instruction::ICmp && Op != Instruction::FCmp) { - // The builder propagates its fast-math-flags setting. TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"); } else { @@ -981,13 +990,6 @@ Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, "Invalid min/max"); TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf); } - if (!RedOps.empty()) - propagateIRFlags(TmpVec, RedOps); - - // We may compute the reassociated scalar ops in a way that does not - // preserve nsw/nuw etc. Conservatively, drop those flags. - if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec)) - ReductionInst->dropPoisonGeneratingFlags(); } // The result is in the first element of the vector. 
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); @@ -1035,8 +1037,7 @@ Value *llvm::createSelectCmpTargetReduction(IRBuilderBase &Builder, Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, const TargetTransformInfo *TTI, - Value *Src, RecurKind RdxKind, - ArrayRef<Value *> RedOps) { + Value *Src, RecurKind RdxKind) { auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType(); switch (RdxKind) { case RecurKind::Add: diff --git a/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/llvm/lib/Transforms/Utils/MetaRenamer.cpp index 3ce10535d45f..9fba2f3f86b5 100644 --- a/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/MetaRenamer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -31,10 +32,36 @@ #include "llvm/IR/TypeFinder.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" using namespace llvm; +static cl::opt<std::string> RenameExcludeFunctionPrefixes( + "rename-exclude-function-prefixes", + cl::desc("Prefixes for functions that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeAliasPrefixes( + "rename-exclude-alias-prefixes", + cl::desc("Prefixes for aliases that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeGlobalPrefixes( + "rename-exclude-global-prefixes", + cl::desc( + "Prefixes for global values that don't need to be renamed, separated " + "by a comma"), + cl::Hidden); + +static cl::opt<std::string> RenameExcludeStructPrefixes( + "rename-exclude-struct-prefixes", + cl::desc("Prefixes for structs that don't need to be renamed, separated " + "by a comma"), + 
cl::Hidden); + static const char *const metaNames[] = { // See http://en.wikipedia.org/wiki/Metasyntactic_variable "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge", @@ -66,6 +93,18 @@ struct Renamer { PRNG prng; }; +static void +parseExcludedPrefixes(StringRef PrefixesStr, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + for (;;) { + auto PrefixesSplit = PrefixesStr.split(','); + if (PrefixesSplit.first.empty()) + break; + ExcludedPrefixes.push_back(PrefixesSplit.first); + PrefixesStr = PrefixesSplit.second; + } +} + void MetaRename(Function &F) { for (Argument &Arg : F.args()) if (!Arg.getType()->isVoidTy()) @@ -91,10 +130,26 @@ void MetaRename(Module &M, Renamer renamer(randSeed); + SmallVector<StringRef, 8> ExcludedAliasesPrefixes; + SmallVector<StringRef, 8> ExcludedGlobalsPrefixes; + SmallVector<StringRef, 8> ExcludedStructsPrefixes; + SmallVector<StringRef, 8> ExcludedFuncPrefixes; + parseExcludedPrefixes(RenameExcludeAliasPrefixes, ExcludedAliasesPrefixes); + parseExcludedPrefixes(RenameExcludeGlobalPrefixes, ExcludedGlobalsPrefixes); + parseExcludedPrefixes(RenameExcludeStructPrefixes, ExcludedStructsPrefixes); + parseExcludedPrefixes(RenameExcludeFunctionPrefixes, ExcludedFuncPrefixes); + + auto IsNameExcluded = [](StringRef &Name, + SmallVectorImpl<StringRef> &ExcludedPrefixes) { + return any_of(ExcludedPrefixes, + [&Name](auto &Prefix) { return Name.startswith(Prefix); }); + }; + // Rename all aliases for (GlobalAlias &GA : M.aliases()) { StringRef Name = GA.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedAliasesPrefixes)) continue; GA.setName("alias"); @@ -103,7 +158,8 @@ void MetaRename(Module &M, // Rename all global variables for (GlobalVariable &GV : M.globals()) { StringRef Name = GV.getName(); - if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1)) + if (Name.startswith("llvm.") || 
(!Name.empty() && Name[0] == 1) || + IsNameExcluded(Name, ExcludedGlobalsPrefixes)) continue; GV.setName("global"); @@ -113,7 +169,9 @@ void MetaRename(Module &M, TypeFinder StructTypes; StructTypes.run(M, true); for (StructType *STy : StructTypes) { - if (STy->isLiteral() || STy->getName().empty()) + StringRef Name = STy->getName(); + if (STy->isLiteral() || Name.empty() || + IsNameExcluded(Name, ExcludedStructsPrefixes)) continue; SmallString<128> NameStorage; @@ -128,7 +186,8 @@ void MetaRename(Module &M, // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - GetTLI(F).getLibFunc(F, Tmp)) + GetTLI(F).getLibFunc(F, Tmp) || + IsNameExcluded(Name, ExcludedFuncPrefixes)) continue; // Leave @main alone. The output of -metarenamer might be passed to diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 3ebc89158173..65207056a3f4 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -144,6 +144,10 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { Value *Offset = Builder.CreateShl(Index, ConstantInt::get(IntTy, 2), "reltable.shift"); + // Insert the call to load.relative instrinsic before LOAD. + // GEP might not be immediately followed by a LOAD, like it can be hoisted + // outside the loop or another instruction might be inserted them in between. 
+ Builder.SetInsertPoint(Load); Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); Value *Base = Builder.CreateBitCast(RelLookupTable, Builder.getInt8PtrTy()); diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 9495e442e0bf..2f2dff6b5f0b 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -220,7 +220,7 @@ private: Now = Pred; } - assert(PathCapacity > 0 && "found incorrect augmenting path"); + assert(PathCapacity > 0 && "found an incorrect augmenting path"); // Update the flow along the path Now = Target; @@ -271,6 +271,352 @@ private: uint64_t Target; }; +/// A post-processing adjustment of control flow. It applies two steps by +/// rerouting some flow and making it more realistic: +/// +/// - First, it removes all isolated components ("islands") with a positive flow +/// that are unreachable from the entry block. For every such component, we +/// find the shortest from the entry to an exit passing through the component, +/// and increase the flow by one unit along the path. +/// +/// - Second, it identifies all "unknown subgraphs" consisting of basic blocks +/// with no sampled counts. Then it rebalnces the flow that goes through such +/// a subgraph so that each branch is taken with probability 50%. +/// An unknown subgraph is such that for every two nodes u and v: +/// - u dominates v and u is not unknown; +/// - v post-dominates u; and +/// - all inner-nodes of all (u,v)-paths are unknown. +/// +class FlowAdjuster { +public: + FlowAdjuster(FlowFunction &Func) : Func(Func) { + assert(Func.Blocks[Func.Entry].isEntry() && + "incorrect index of the entry block"); + } + + // Run the post-processing + void run() { + /// Adjust the flow to get rid of isolated components. + joinIsolatedComponents(); + + /// Rebalance the flow inside unknown subgraphs. 
+ rebalanceUnknownSubgraphs(); + } + + /// The probability for the first successor of a unknown subgraph + static constexpr double UnknownFirstSuccProbability = 0.5; + +private: + void joinIsolatedComponents() { + // Find blocks that are reachable from the source + auto Visited = std::vector<bool>(NumBlocks(), false); + findReachable(Func.Entry, Visited); + + // Iterate over all non-reachable blocks and adjust their weights + for (uint64_t I = 0; I < NumBlocks(); I++) { + auto &Block = Func.Blocks[I]; + if (Block.Flow > 0 && !Visited[I]) { + // Find a path from the entry to an exit passing through the block I + auto Path = findShortestPath(I); + // Increase the flow along the path + assert(Path.size() > 0 && Path[0]->Source == Func.Entry && + "incorrectly computed path adjusting control flow"); + Func.Blocks[Func.Entry].Flow += 1; + for (auto &Jump : Path) { + Jump->Flow += 1; + Func.Blocks[Jump->Target].Flow += 1; + // Update reachability + findReachable(Jump->Target, Visited); + } + } + } + } + + /// Run BFS from a given block along the jumps with a positive flow and mark + /// all reachable blocks. + void findReachable(uint64_t Src, std::vector<bool> &Visited) { + if (Visited[Src]) + return; + std::queue<uint64_t> Queue; + Queue.push(Src); + Visited[Src] = true; + while (!Queue.empty()) { + Src = Queue.front(); + Queue.pop(); + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + if (Jump->Flow > 0 && !Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + } + + /// Find the shortest path from the entry block to an exit block passing + /// through a given block. 
+ std::vector<FlowJump *> findShortestPath(uint64_t BlockIdx) { + // A path from the entry block to BlockIdx + auto ForwardPath = findShortestPath(Func.Entry, BlockIdx); + // A path from BlockIdx to an exit block + auto BackwardPath = findShortestPath(BlockIdx, AnyExitBlock); + + // Concatenate the two paths + std::vector<FlowJump *> Result; + Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); + Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); + return Result; + } + + /// Apply the Dijkstra algorithm to find the shortest path from a given + /// Source to a given Target block. + /// If Target == -1, then the path ends at an exit block. + std::vector<FlowJump *> findShortestPath(uint64_t Source, uint64_t Target) { + // Quit early, if possible + if (Source == Target) + return std::vector<FlowJump *>(); + if (Func.Blocks[Source].isExit() && Target == AnyExitBlock) + return std::vector<FlowJump *>(); + + // Initialize data structures + auto Distance = std::vector<int64_t>(NumBlocks(), INF); + auto Parent = std::vector<FlowJump *>(NumBlocks(), nullptr); + Distance[Source] = 0; + std::set<std::pair<uint64_t, uint64_t>> Queue; + Queue.insert(std::make_pair(Distance[Source], Source)); + + // Run the Dijkstra algorithm + while (!Queue.empty()) { + uint64_t Src = Queue.begin()->second; + Queue.erase(Queue.begin()); + // If we found a solution, quit early + if (Src == Target || + (Func.Blocks[Src].isExit() && Target == AnyExitBlock)) + break; + + for (auto Jump : Func.Blocks[Src].SuccJumps) { + uint64_t Dst = Jump->Target; + int64_t JumpDist = jumpDistance(Jump); + if (Distance[Dst] > Distance[Src] + JumpDist) { + Queue.erase(std::make_pair(Distance[Dst], Dst)); + + Distance[Dst] = Distance[Src] + JumpDist; + Parent[Dst] = Jump; + + Queue.insert(std::make_pair(Distance[Dst], Dst)); + } + } + } + // If Target is not provided, find the closest exit block + if (Target == AnyExitBlock) { + for (uint64_t I = 0; I < NumBlocks(); I++) { + if 
(Func.Blocks[I].isExit() && Parent[I] != nullptr) { + if (Target == AnyExitBlock || Distance[Target] > Distance[I]) { + Target = I; + } + } + } + } + assert(Parent[Target] != nullptr && "a path does not exist"); + + // Extract the constructed path + std::vector<FlowJump *> Result; + uint64_t Now = Target; + while (Now != Source) { + assert(Now == Parent[Now]->Target && "incorrect parent jump"); + Result.push_back(Parent[Now]); + Now = Parent[Now]->Source; + } + // Reverse the path, since it is extracted from Target to Source + std::reverse(Result.begin(), Result.end()); + return Result; + } + + /// A distance of a path for a given jump. + /// In order to incite the path to use blocks/jumps with large positive flow, + /// and avoid changing branch probability of outgoing edges drastically, + /// set the distance as follows: + /// if Jump.Flow > 0, then distance = max(100 - Jump->Flow, 0) + /// if Block.Weight > 0, then distance = 1 + /// otherwise distance >> 1 + int64_t jumpDistance(FlowJump *Jump) const { + int64_t BaseDistance = 100; + if (Jump->IsUnlikely) + return MinCostMaxFlow::AuxCostUnlikely; + if (Jump->Flow > 0) + return std::max(BaseDistance - (int64_t)Jump->Flow, (int64_t)0); + if (Func.Blocks[Jump->Target].Weight > 0) + return BaseDistance; + return BaseDistance * (NumBlocks() + 1); + }; + + uint64_t NumBlocks() const { return Func.Blocks.size(); } + + /// Rebalance unknown subgraphs so as each branch splits with probabilities + /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability + void rebalanceUnknownSubgraphs() { + assert(UnknownFirstSuccProbability >= 0.0 && + UnknownFirstSuccProbability <= 1.0 && + "the share of the unknown successor should be between 0 and 1"); + // Try to find unknown subgraphs from each non-unknown block + for (uint64_t I = 0; I < Func.Blocks.size(); I++) { + auto SrcBlock = &Func.Blocks[I]; + // Do not attempt to find unknown successors from a unknown or a + // zero-flow block + if (SrcBlock->UnknownWeight || 
SrcBlock->Flow == 0) + continue; + + std::vector<FlowBlock *> UnknownSuccs; + FlowBlock *DstBlock = nullptr; + // Find a unknown subgraphs starting at block SrcBlock + if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + // At the moment, we do not rebalance subgraphs containing cycles among + // unknown blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs)) + continue; + + // Rebalance the flow + rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs); + } + } + + /// Find a unknown subgraph starting at block SrcBlock. + /// If the search is successful, the method sets DstBlock and UnknownSuccs. + bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Run BFS from SrcBlock and make sure all paths are going through unknown + // blocks and end at a non-unknown DstBlock + auto Visited = std::vector<bool>(NumBlocks(), false); + std::queue<uint64_t> Queue; + DstBlock = nullptr; + + Queue.push(SrcBlock->Index); + Visited[SrcBlock->Index] = true; + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Process blocks reachable from Block + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + if (Visited[Dst]) + continue; + Visited[Dst] = true; + if (!Func.Blocks[Dst].UnknownWeight) { + // If we see non-unique non-unknown block reachable from SrcBlock, + // stop processing and skip rebalancing + FlowBlock *CandidateDstBlock = &Func.Blocks[Dst]; + if (DstBlock != nullptr && DstBlock != CandidateDstBlock) + return false; + DstBlock = CandidateDstBlock; + } else { + Queue.push(Dst); + UnknownSuccs.push_back(&Func.Blocks[Dst]); + } + } + } + + // If the list of unknown blocks is empty, we don't need rebalancing + if (UnknownSuccs.empty()) + return false; + // If all reachable nodes from SrcBlock are unknown, skip rebalancing + if (DstBlock == nullptr) + return false; + // If any of the unknown blocks is an exit block, skip 
rebalancing + for (auto Block : UnknownSuccs) { + if (Block->isExit()) + return false; + } + + return true; + } + + /// Verify if the given unknown subgraph is acyclic, and if yes, reorder + /// UnknownSuccs in the topological order (so that all jumps are "forward"). + bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + // Extract local in-degrees in the considered subgraph + auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0); + for (auto Jump : SrcBlock->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + for (uint64_t I = 0; I < UnknownSuccs.size(); I++) { + for (auto Jump : UnknownSuccs[I]->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + } + // A loop containing SrcBlock + if (LocalInDegree[SrcBlock->Index] > 0) + return false; + + std::vector<FlowBlock *> AcyclicOrder; + std::queue<uint64_t> Queue; + Queue.push(SrcBlock->Index); + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Stop propagation once we reach DstBlock + if (Block.Index == DstBlock->Index) + break; + + AcyclicOrder.push_back(&Block); + // Add to the queue all successors with zero local in-degree + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + LocalInDegree[Dst]--; + if (LocalInDegree[Dst] == 0) { + Queue.push(Dst); + } + } + } + + // If there is a cycle in the subgraph, AcyclicOrder contains only a subset + // of all blocks + if (UnknownSuccs.size() + 1 != AcyclicOrder.size()) + return false; + UnknownSuccs = AcyclicOrder; + return true; + } + + /// Rebalance a given subgraph. 
+ void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector<FlowBlock *> &UnknownSuccs) { + assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph"); + assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns"); + + for (auto Block : UnknownSuccs) { + // Block's flow is the sum of incoming flows + uint64_t TotalFlow = 0; + if (Block == SrcBlock) { + TotalFlow = Block->Flow; + } else { + for (auto Jump : Block->PredJumps) { + TotalFlow += Jump->Flow; + } + Block->Flow = TotalFlow; + } + + // Process all successor jumps and update corresponding flow values + for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) { + auto Jump = Block->SuccJumps[I]; + if (I + 1 == Block->SuccJumps.size()) { + Jump->Flow = TotalFlow; + continue; + } + uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability); + Jump->Flow = Flow; + TotalFlow -= Flow; + } + } + } + + /// A constant indicating an arbitrary exit block of a function. + static constexpr uint64_t AnyExitBlock = uint64_t(-1); + + /// The function. + FlowFunction &Func; +}; + /// Initializing flow network for a given function. 
/// /// Every block is split into three nodes that are responsible for (i) an @@ -440,6 +786,39 @@ void verifyWeights(const FlowFunction &Func) { } } assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow"); + + // Verify that there are no isolated flow components + // One could modify FlowFunction to hold edges indexed by the sources, which + // will avoid a creation of the object + auto PositiveFlowEdges = std::vector<std::vector<uint64_t>>(NumBlocks); + for (auto &Jump : Func.Jumps) { + if (Jump.Flow > 0) { + PositiveFlowEdges[Jump.Source].push_back(Jump.Target); + } + } + + // Run BFS from the source along edges with positive flow + std::queue<uint64_t> Queue; + auto Visited = std::vector<bool>(NumBlocks, false); + Queue.push(Func.Entry); + Visited[Func.Entry] = true; + while (!Queue.empty()) { + uint64_t Src = Queue.front(); + Queue.pop(); + for (uint64_t Dst : PositiveFlowEdges[Src]) { + if (!Visited[Dst]) { + Queue.push(Dst); + Visited[Dst] = true; + } + } + } + + // Verify that every block that has a positive flow is reached from the source + // along edges with a positive flow + for (uint64_t I = 0; I < NumBlocks; I++) { + auto &Block = Func.Blocks[I]; + assert((Visited[I] || Block.Flow == 0) && "an isolated flow component"); + } } #endif @@ -455,6 +834,10 @@ void llvm::applyFlowInference(FlowFunction &Func) { // Extract flow values for every block and every edge extractWeights(InferenceNetwork, Func); + // Post-processing adjustments to the flow + auto Adjuster = FlowAdjuster(Func); + Adjuster.run(); + #ifndef NDEBUG // Verify the result verifyWeights(Func); diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 71c15d5c51fc..c840ee85795f 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1047,9 +1047,9 @@ bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) { if 
(SE.DT.dominates(IncV, InsertPos)) break; } - for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) { - fixupInsertPoints(*I); - (*I)->moveBefore(InsertPos); + for (Instruction *I : llvm::reverse(IVIncs)) { + fixupInsertPoints(I); + I->moveBefore(InsertPos); } return true; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index afa3ecde77f9..1046998c26de 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3629,7 +3629,7 @@ static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, return false; // TODO // Use lambda to lazily compute expensive condition after cheap ones. auto NoSideEffects = [](BasicBlock &BB) { - return !llvm::any_of(BB, [](const Instruction &I) { + return llvm::none_of(BB, [](const Instruction &I) { return I.mayWriteToMemory() || I.mayHaveSideEffects(); }); }; diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e190a1294eb3..02727a3dbf9c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -193,6 +193,19 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A } } +// Copy CallInst "flags" like musttail, notail, and tail. Return New param for +// easier chaining. Calls to emit* and B.createCall should probably be wrapped +// in this function when New is created to replace Old. Callers should take +// care to check Old.isMustTailCall() if they aren't replacing Old directly +// with New. 
+static Value *copyFlags(const CallInst &Old, Value *New) { + assert(!Old.isMustTailCall() && "do not copy musttail call flags"); + assert(!Old.isNoTailCall() && "do not copy notail call flags"); + if (auto *NewCI = dyn_cast_or_null<CallInst>(New)) + NewCI->setTailCallKind(Old.getTailCallKind()); + return New; +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -215,7 +228,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) { if (Len == 0) return Dst; - return emitStrLenMemCpy(Src, Dst, Len, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, Len, B)); } Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, @@ -279,7 +292,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further. - return emitStrLenMemCpy(Src, Dst, SrcLen, B); + return copyFlags(*CI, emitStrLenMemCpy(Src, Dst, SrcLen, B)); } Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { @@ -300,9 +313,11 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) { if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; - return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), - B, DL, TLI); + return copyFlags( + *CI, + emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. 
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len), B, + DL, TLI)); } // Otherwise, the character is a constant, see if the first argument is @@ -340,7 +355,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) { if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (CharC->isZero()) - return emitStrChr(SrcStr, '\0', B, TLI); + return copyFlags(*CI, emitStrChr(SrcStr, '\0', B, TLI)); return nullptr; } @@ -385,25 +400,28 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) { annotateDereferenceableBytes(CI, 1, Len2); if (Len1 && Len2) { - return emitMemCmp(Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - std::min(Len1, Len2)), - B, DL, TLI); + return copyFlags( + *CI, emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + std::min(Len1, Len2)), + B, DL, TLI)); } // strcmp to memcmp if (!HasStr1 && HasStr2) { if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); @@ -430,7 +448,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); + return copyFlags(*CI, emitMemCmp(Str1P, Str2P, Size, B, DL, TLI)); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, 
Str1); @@ -462,17 +480,19 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (!HasStr1 && HasStr2) { Len2 = std::min(Len2, Length); if (canTransformToMemCmp(CI, Str1P, Len2, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), + B, DL, TLI)); } else if (HasStr1 && !HasStr2) { Len1 = std::min(Len1, Length); if (canTransformToMemCmp(CI, Str2P, Len1, DL)) - return emitMemCmp( - Str1P, Str2P, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL, - TLI); + return copyFlags( + *CI, + emitMemCmp(Str1P, Str2P, + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), + B, DL, TLI)); } return nullptr; @@ -485,7 +505,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) { if (SrcLen && Size) { annotateDereferenceableBytes(CI, 0, SrcLen); if (SrcLen <= Size->getZExtValue() + 1) - return emitStrDup(Src, B, TLI); + return copyFlags(*CI, emitStrDup(Src, B, TLI)); } return nullptr; @@ -495,7 +515,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); @@ -511,6 +531,7 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -520,7 +541,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { // stpcpy(d,s) -> strcpy(d,s) if the result is not used. 
if (CI->use_empty()) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) Value *StrLen = emitStrLen(Src, B, DL, TLI); @@ -544,6 +565,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return DstEnd; } @@ -583,6 +605,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0)); NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( CI->getContext(), 0, ArgAttrs)); + copyFlags(*CI, NewCI); return Dst; } @@ -606,6 +629,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) { ConstantInt::get(DL.getIntPtrType(PT), Len)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return Dst; } @@ -737,7 +761,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) { // strpbrk(s, "a") -> strchr(s, 'a') if (HasS2 && S2.size() == 1) - return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI); + return copyFlags(*CI, emitStrChr(CI->getArgOperand(0), S2[0], B, TLI)); return nullptr; } @@ -793,7 +817,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) { // strcspn(s, "") -> strlen(s) if (HasS2 && S2.empty()) - return emitStrLen(CI->getArgOperand(0), B, DL, TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, DL, TLI)); return nullptr; } @@ -1062,7 +1086,7 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - return emitBCmp(LHS, RHS, Size, B, DL, TLI); + return 
copyFlags(*CI, emitBCmp(LHS, RHS, Size, B, DL, TLI)); } return nullptr; @@ -1083,6 +1107,7 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1110,7 +1135,8 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); if (Pos == StringRef::npos) { if (N->getZExtValue() <= SrcStr.size()) { - B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3)); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), + CI->getArgOperand(3))); return Constant::getNullValue(CI->getType()); } return nullptr; @@ -1119,7 +1145,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) { Value *NewN = ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); // memccpy -> llvm.memcpy - B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN); + copyFlags(*CI, B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN)); return Pos + 1 <= N->getZExtValue() ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) : Constant::getNullValue(CI->getType()); @@ -1136,6 +1162,7 @@ Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) { // TODO: Attach return value attributes to the 1st operand to preserve them? 
NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); } @@ -1150,6 +1177,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) { CI->getArgOperand(1), Align(1), Size); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } @@ -1164,12 +1192,13 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) { CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) { if (isa<ConstantPointerNull>(CI->getArgOperand(0))) - return emitMalloc(CI->getArgOperand(1), B, DL, TLI); + return copyFlags(*CI, emitMalloc(CI->getArgOperand(1), B, DL, TLI)); return nullptr; } @@ -1190,7 +1219,7 @@ static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B, Function *F = Intrinsic::getDeclaration(M, IID, CI->getType()); CallInst *NewCall = B.CreateCall(F, V); NewCall->takeName(CI); - return NewCall; + return copyFlags(*CI, NewCall); } /// Return a variant of Val with float type. 
@@ -1311,7 +1340,8 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) { Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt, CI->getType()); - return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs"); + return copyFlags( + *CI, B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs")); } static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, @@ -1334,14 +1364,16 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func, // sin(-X) --> -sin(X) // tan(-X) --> -tan(X) if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) - return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X)); + return B.CreateFNeg( + copyFlags(*Call, B.CreateCall(Call->getCalledFunction(), X))); break; case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosl: // cos(-X) --> cos(X) if (match(Call->getArgOperand(0), m_FNeg(m_Value(X)))) - return B.CreateCall(Call->getCalledFunction(), X, "cos"); + return copyFlags(*Call, + B.CreateCall(Call->getCalledFunction(), X, "cos")); break; default: break; @@ -1476,9 +1508,10 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, - LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, - B, Attrs); + return copyFlags(*Pow, + emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, + TLI, LibFunc_ldexp, LibFunc_ldexpf, + LibFunc_ldexpl, B, Attrs)); } // pow(2.0 ** n, x) -> exp2(n * x) @@ -1496,11 +1529,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { double N = NI.logBase2() * (IsReciprocal ? 
-1.0 : 1.0); Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1508,8 +1543,9 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) - return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, - LibFunc_exp10l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, + LibFunc_exp10f, LibFunc_exp10l, + B, Attrs)); // pow(x, y) -> exp2(log2(x) * y) if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() && @@ -1528,11 +1564,13 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); if (Pow->doesNotAccessMemory()) - return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), - FMul, "exp2"); + return copyFlags(*Pow, B.CreateCall(Intrinsic::getDeclaration( + Mod, Intrinsic::exp2, Ty), + FMul, "exp2")); else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); + return copyFlags(*Pow, emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, + LibFunc_exp2f, + LibFunc_exp2l, B, Attrs)); } } @@ -1595,6 +1633,8 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) { Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs"); } + Sqrt = 
copyFlags(*Pow, Sqrt); + // Handle non finite base by expanding to // (x == -infinity ? +infinity : sqrt(x)). if (!Pow->hasNoInfs()) { @@ -1721,15 +1761,18 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) { if (ExpoF->isInteger() && ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) == APFloat::opOK) { - return createPowWithIntegerExponent( - Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), M, B); + return copyFlags( + *Pow, + createPowWithIntegerExponent( + Base, ConstantInt::get(B.getIntNTy(TLI->getIntSize()), IntExpo), + M, B)); } } // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) { if (Value *ExpoI = getIntToFPVal(Expo, B, TLI->getIntSize())) - return createPowWithIntegerExponent(Base, ExpoI, M, B); + return copyFlags(*Pow, createPowWithIntegerExponent(Base, ExpoI, M, B)); } // Shrink pow() to powf() if the arguments are single precision, @@ -1792,7 +1835,8 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) { Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum : Intrinsic::maxnum; Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType()); - return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); + return copyFlags( + *CI, B.CreateCall(F, {CI->getArgOperand(0), CI->getArgOperand(1)})); } Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) { @@ -2010,9 +2054,9 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) { // of the square root calculation. Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType); Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt"); - return B.CreateFMul(FabsCall, SqrtCall); + return copyFlags(*CI, B.CreateFMul(FabsCall, SqrtCall)); } - return FabsCall; + return copyFlags(*CI, FabsCall); } // TODO: Generalize to handle any trig function and its inverse. 
@@ -2327,7 +2371,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // printf("x") -> putchar('x'), even for "%" and "%%". if (FormatStr.size() == 1 || FormatStr == "%%") - return emitPutChar(B.getInt32(FormatStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(FormatStr[0]), B, TLI)); // Try to remove call or emit putchar/puts. if (FormatStr == "%s" && CI->arg_size() > 1) { @@ -2339,12 +2383,12 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { return (Value *)CI; // printf("%s", "a") --> putchar('a') if (OperandStr.size() == 1) - return emitPutChar(B.getInt32(OperandStr[0]), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32(OperandStr[0]), B, TLI)); // printf("%s", str"\n") --> puts(str) if (OperandStr.back() == '\n') { OperandStr = OperandStr.drop_back(); Value *GV = B.CreateGlobalString(OperandStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } return nullptr; } @@ -2356,19 +2400,19 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - return emitPutS(GV, B, TLI); + return copyFlags(*CI, emitPutS(GV, B, TLI)); } // Optimize specific format strings. 
// printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) - return emitPutChar(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutChar(CI->getArgOperand(1), B, TLI)); // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->arg_size() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) - return emitPutS(CI->getArgOperand(1), B, TLI); + return copyFlags(*CI, emitPutS(CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -2459,7 +2503,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, if (CI->use_empty()) // sprintf(dest, "%s", str) -> strcpy(dest, str) - return emitStrCpy(Dest, CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, emitStrCpy(Dest, CI->getArgOperand(2), B, TLI)); uint64_t SrcLen = GetStringLength(CI->getArgOperand(2)); if (SrcLen) { @@ -2558,10 +2602,12 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) - B.CreateMemCpy( - CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + copyFlags( + *CI, + B.CreateMemCpy( + CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1))); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2599,8 +2645,10 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3), - Align(1), ConstantInt::get(CI->getType(), Str.size() + 1)); + copyFlags( + *CI, B.CreateMemCpy(CI->getArgOperand(0), Align(1), + CI->getArgOperand(3), Align(1), + ConstantInt::get(CI->getType(), Str.size() + 1))); // The snprintf result is the unincremented number of bytes in the string. 
return ConstantInt::get(CI->getType(), Str.size()); @@ -2640,10 +2688,11 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, if (FormatStr.contains('%')) return nullptr; // We found a format specifier. - return emitFWrite( - CI->getArgOperand(1), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()), - CI->getArgOperand(0), B, DL, TLI); + return copyFlags( + *CI, emitFWrite(CI->getArgOperand(1), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size()), + CI->getArgOperand(0), B, DL, TLI)); } // The remaining optimizations require the format string to be "%s" or "%c" @@ -2656,14 +2705,16 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return nullptr; - return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return nullptr; - return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI); + return copyFlags( + *CI, emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI)); } return nullptr; } @@ -2750,10 +2801,11 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) { return nullptr; // Known to have no uses (see above). 
- return emitFWrite( - CI->getArgOperand(0), - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), - CI->getArgOperand(1), B, DL, TLI); + return copyFlags( + *CI, + emitFWrite(CI->getArgOperand(0), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1), + CI->getArgOperand(1), B, DL, TLI)); } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { @@ -2765,15 +2817,16 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) { // puts("") -> putchar('\n') StringRef Str; if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty()) - return emitPutChar(B.getInt32('\n'), B, TLI); + return copyFlags(*CI, emitPutChar(B.getInt32('\n'), B, TLI)); return nullptr; } Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) { // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) - return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0), - Align(1), CI->getArgOperand(2)); + return copyFlags(*CI, B.CreateMemMove(CI->getArgOperand(1), Align(1), + CI->getArgOperand(0), Align(1), + CI->getArgOperand(2))); } bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { @@ -2971,6 +3024,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, } Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) { + assert(!CI->isMustTailCall() && "These transforms aren't musttail safe."); + // TODO: Split out the code below that operates on FP calls so that // we can all non-FP calls with the StrictFP attribute to be // optimized. 
@@ -3212,6 +3267,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3225,6 +3281,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, Align(1), CI->getArgOperand(2)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3238,6 +3295,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, CI->getArgOperand(2), Align(1)); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + copyFlags(*CI, NewCI); return CI->getArgOperand(0); } return nullptr; @@ -3252,7 +3310,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI, CallInst *NewCI = cast<CallInst>(Call); NewCI->setAttributes(CI->getAttributes()); NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); - return NewCI; + return copyFlags(*CI, NewCI); } return nullptr; } @@ -3277,9 +3335,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // string lengths for varying. if (isFortifiedCallFoldable(CI, 2, None, 1)) { if (Func == LibFunc_strcpy_chk) - return emitStrCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStrCpy(Dst, Src, B, TLI)); else - return emitStpCpy(Dst, Src, B, TLI); + return copyFlags(*CI, emitStpCpy(Dst, Src, B, TLI)); } if (OnlyLowerUnknownSize) @@ -3303,14 +3361,14 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // a __memcpy_chk, we still need to return the correct end pointer. 
if (Ret && Func == LibFunc_stpcpy_chk) return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1)); - return Ret; + return copyFlags(*CI, cast<CallInst>(Ret)); } Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 1, None, 0)) - return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(), - TLI); + return copyFlags(*CI, emitStrLen(CI->getArgOperand(0), B, + CI->getModule()->getDataLayout(), TLI)); return nullptr; } @@ -3319,11 +3377,13 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, LibFunc Func) { if (isFortifiedCallFoldable(CI, 3, 2)) { if (Func == LibFunc_strncpy_chk) - return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); else - return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); } return nullptr; @@ -3332,8 +3392,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 4, 3)) - return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), CI->getArgOperand(3), B, TLI); + return copyFlags( + *CI, emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), B, TLI)); return nullptr; } @@ -3342,8 +3403,9 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5)); - return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), VariadicArgs, B, TLI); + return 
copyFlags(*CI, + emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), VariadicArgs, B, TLI)); } return nullptr; @@ -3353,8 +3415,9 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) { SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4)); - return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs, - B, TLI); + return copyFlags(*CI, + emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + VariadicArgs, B, TLI)); } return nullptr; @@ -3363,7 +3426,8 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2)) - return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI); + return copyFlags( + *CI, emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI)); return nullptr; } @@ -3371,8 +3435,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3380,8 +3445,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3389,8 +3455,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI, Value 
*FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3)) - return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TLI); + return copyFlags(*CI, + emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), B, TLI)); return nullptr; } @@ -3398,8 +3465,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) - return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(4), CI->getArgOperand(5), B, TLI); + return copyFlags( + *CI, emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(4), CI->getArgOperand(5), B, TLI)); return nullptr; } @@ -3407,8 +3475,9 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI, IRBuilderBase &B) { if (isFortifiedCallFoldable(CI, 2, None, None, 1)) - return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), - CI->getArgOperand(4), B, TLI); + return copyFlags(*CI, + emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), + CI->getArgOperand(4), B, TLI)); return nullptr; } diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index c3eafd6b2492..b822db938af8 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -450,6 +450,12 @@ Value *Mapper::mapValue(const Value *V) { DSOLocalEquivalent::get(Func), NewTy); } + if (const auto *NC = dyn_cast<NoCFIValue>(C)) { + auto *Val = mapValue(NC->getGlobalValue()); + GlobalValue *GV = cast<GlobalValue>(Val); + return getVM()[NC] = NoCFIValue::get(GV); + } + auto mapValueOrNull = [this](Value *V) { auto Mapped = mapValue(V); assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) && diff --git 
a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 805011191da0..81e5aa223c07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -55,22 +55,23 @@ static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold( cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma")); -// FIXME: When scalable vectorization is stable enough, change the default -// to SK_PreferFixedWidth. -static cl::opt<LoopVectorizeHints::ScalableForceKind> ScalableVectorization( - "scalable-vectorization", cl::init(LoopVectorizeHints::SK_FixedWidthOnly), - cl::Hidden, - cl::desc("Control whether the compiler can use scalable vectors to " - "vectorize a loop"), - cl::values( - clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", - "Scalable vectorization is disabled."), - clEnumValN(LoopVectorizeHints::SK_PreferFixedWidth, "on", - "Scalable vectorization is available, but favor fixed-width " - "vectorization when the cost is inconclusive."), - clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", - "Scalable vectorization is available and favored when the " - "cost is inconclusive."))); +static cl::opt<LoopVectorizeHints::ScalableForceKind> + ForceScalableVectorization( + "scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), + cl::Hidden, + cl::desc("Control whether the compiler can use scalable vectors to " + "vectorize a loop"), + cl::values( + clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", + "Scalable vectorization is disabled."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "preferred", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."), + clEnumValN( + LoopVectorizeHints::SK_PreferScalable, "on", + "Scalable vectorization is available and favored when the " + "cost is inconclusive."))); /// Maximum vectorization 
interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -95,7 +96,8 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { LoopVectorizeHints::LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, - OptimizationRemarkEmitter &ORE) + OptimizationRemarkEmitter &ORE, + const TargetTransformInfo *TTI) : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE), Force("vectorize.enable", FK_Undefined, HK_FORCE), @@ -110,14 +112,32 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, if (VectorizerParams::isInterleaveForced()) Interleave.Value = VectorizerParams::VectorizationInterleave; + // If the metadata doesn't explicitly specify whether to enable scalable + // vectorization, then decide based on the following criteria (increasing + // level of priority): + // - Target default + // - Metadata width + // - Force option (always overrides) + if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) { + if (TTI) + Scalable.Value = TTI->enableScalableVectorization() ? SK_PreferScalable + : SK_FixedWidthOnly; + + if (Width.Value) + // If the width is set, but the metadata says nothing about the scalable + // property, then assume it concerns only a fixed-width UserVF. + // If width is not set, the flag takes precedence. + Scalable.Value = SK_FixedWidthOnly; + } + + // If the flag is set to force any use of scalable vectors, override the loop + // hints. + if (ForceScalableVectorization.getValue() != + LoopVectorizeHints::SK_Unspecified) + Scalable.Value = ForceScalableVectorization.getValue(); + + // Scalable vectorization is disabled if no preference is specified. if ((LoopVectorizeHints::ScalableForceKind)Scalable.Value == SK_Unspecified) - // If the width is set, but the metadata says nothing about the scalable - // property, then assume it concerns only a fixed-width UserVF. - // If width is not set, the flag takes precedence. 
- Scalable.Value = Width.Value ? SK_FixedWidthOnly : ScalableVectorization; - else if (ScalableVectorization == SK_FixedWidthOnly) - // If the flag is set to disable any use of scalable vectors, override the - // loop hint. Scalable.Value = SK_FixedWidthOnly; if (IsVectorized.Value != 1) @@ -929,7 +949,7 @@ bool LoopVectorizationLegality::canVectorizeFPMath( })); } -bool LoopVectorizationLegality::isInductionPhi(const Value *V) { +bool LoopVectorizationLegality::isInductionPhi(const Value *V) const { Value *In0 = const_cast<Value *>(V); PHINode *PN = dyn_cast_or_null<PHINode>(In0); if (!PN) @@ -938,16 +958,29 @@ bool LoopVectorizationLegality::isInductionPhi(const Value *V) { return Inductions.count(PN); } -bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) { +const InductionDescriptor * +LoopVectorizationLegality::getIntOrFpInductionDescriptor(PHINode *Phi) const { + if (!isInductionPhi(Phi)) + return nullptr; + auto &ID = getInductionVars().find(Phi)->second; + if (ID.getKind() == InductionDescriptor::IK_IntInduction || + ID.getKind() == InductionDescriptor::IK_FpInduction) + return &ID; + return nullptr; +} + +bool LoopVectorizationLegality::isCastedInductionVariable( + const Value *V) const { auto *Inst = dyn_cast<Instruction>(V); return (Inst && InductionCastsToIgnore.count(Inst)); } -bool LoopVectorizationLegality::isInductionVariable(const Value *V) { +bool LoopVectorizationLegality::isInductionVariable(const Value *V) const { return isInductionPhi(V) || isCastedInductionVariable(V); } -bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) { +bool LoopVectorizationLegality::isFirstOrderRecurrence( + const PHINode *Phi) const { return FirstOrderRecurrences.count(Phi); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index a7d6609f8c56..71eb39a18d2f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ 
b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,16 +45,17 @@ class VPBuilder { VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator(); VPInstruction *createInstruction(unsigned Opcode, - ArrayRef<VPValue *> Operands) { - VPInstruction *Instr = new VPInstruction(Opcode, Operands); + ArrayRef<VPValue *> Operands, DebugLoc DL) { + VPInstruction *Instr = new VPInstruction(Opcode, Operands, DL); if (BB) BB->insert(Instr, InsertPt); return Instr; } VPInstruction *createInstruction(unsigned Opcode, - std::initializer_list<VPValue *> Operands) { - return createInstruction(Opcode, ArrayRef<VPValue *>(Operands)); + std::initializer_list<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL); } public: @@ -123,30 +124,33 @@ public: /// its underlying Instruction. VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, Instruction *Inst = nullptr) { - VPInstruction *NewVPInst = createInstruction(Opcode, Operands); + DebugLoc DL; + if (Inst) + DL = Inst->getDebugLoc(); + VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } - VPValue *createNaryOp(unsigned Opcode, - std::initializer_list<VPValue *> Operands, - Instruction *Inst = nullptr) { - return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst); + VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands, + DebugLoc DL) { + return createInstruction(Opcode, Operands, DL); } - VPValue *createNot(VPValue *Operand) { - return createInstruction(VPInstruction::Not, {Operand}); + VPValue *createNot(VPValue *Operand, DebugLoc DL) { + return createInstruction(VPInstruction::Not, {Operand}, DL); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}); + VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL); } - 
VPValue *createOr(VPValue *LHS, VPValue *RHS) { - return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}); + VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL) { + return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS}, DL); } - VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) { - return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}); + VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, + DebugLoc DL) { + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL); } //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5ca0adb4242c..4747f34fcc62 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -428,6 +428,8 @@ class GeneratedRTChecks; namespace llvm { +AnalysisKey ShouldRunExtraVectorPasses::Key; + /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -506,8 +508,8 @@ public: /// Widen an integer or floating-point induction variable \p IV. If \p Trunc /// is provided, the integer induction variable will first be truncated to /// the corresponding type. - void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc, - VPValue *Def, VPValue *CastDef, + void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, VPValue *Def, VPTransformState &State); /// Construct the vector value of a scalarized value \p V one lane at a time. @@ -534,7 +536,7 @@ public: /// Returns true if the reordering of FP operations is not allowed, but we are /// able to vectorize with strict in-order reductions for the given RdxDesc. 
- bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc); /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction @@ -619,7 +621,7 @@ protected: /// can also be a truncate instruction. void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, VPValue *Def, - VPValue *CastDef, VPTransformState &State); + VPTransformState &State); /// Create a vector induction phi node based on an existing scalar one. \p /// EntryVal is the value from the original loop that maps to the vector phi @@ -629,7 +631,6 @@ protected: void createVectorIntOrFpInductionPHI(const InductionDescriptor &II, Value *Step, Value *Start, Instruction *EntryVal, VPValue *Def, - VPValue *CastDef, VPTransformState &State); /// Returns true if an instruction \p I should be scalarized instead of @@ -639,29 +640,6 @@ protected: /// Returns true if we should generate a scalar version of \p IV. bool needsScalarInduction(Instruction *IV) const; - /// If there is a cast involved in the induction variable \p ID, which should - /// be ignored in the vectorized loop body, this function records the - /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the - /// cast. We had already proved that the casted Phi is equal to the uncasted - /// Phi in the vectorized loop (under a runtime guard), and therefore - /// there is no need to vectorize the cast - the same value can be used in the - /// vector loop for both the Phi and the cast. - /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified, - /// Otherwise, \p VectorLoopValue is a widened/vectorized value. 
- /// - /// \p EntryVal is the value from the original loop that maps to the vector - /// phi node and is used to distinguish what is the IV currently being - /// processed - original one (if \p EntryVal is a phi corresponding to the - /// original IV) or the "newly-created" one based on the proof mentioned above - /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the - /// latter case \p EntryVal is a TruncInst and we must not record anything for - /// that IV, but it's error-prone to expect callers of this routine to care - /// about that, hence this explicit parameter. - void recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane = UINT_MAX); - /// Generate a shuffle sequence that will reverse the vector Vec. virtual Value *reverseVector(Value *Vec); @@ -698,7 +676,8 @@ protected: /// flags, which can be found from the original scalar operations. Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const; + const InductionDescriptor &ID, + BasicBlock *VectorHeader) const; /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, /// vector loop preheader, middle block and scalar preheader. Also @@ -1728,7 +1707,8 @@ private: /// disabled or unsupported, then the scalable part will be equal to /// ElementCount::getScalable(0). FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF); + ElementCount UserVF, + bool FoldTailByMasking); /// \return the maximized element count based on the targets vector /// registers and the loop trip-count, but limited to a maximum safe VF. 
@@ -1741,7 +1721,8 @@ private: ElementCount getMaximizedVFForTarget(unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF); + const ElementCount &MaxSafeVF, + bool FoldTailByMasking); /// \return the maximum legal scalable VF, based on the safe max number /// of elements. @@ -2356,8 +2337,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( const InductionDescriptor &II, Value *Step, Value *Start, - Instruction *EntryVal, VPValue *Def, VPValue *CastDef, - VPTransformState &State) { + Instruction *EntryVal, VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && "Expected either an induction phi-node or a truncate of it!"); @@ -2373,7 +2354,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( } Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0); - Value *SplatStart = Builder.CreateVectorSplat(VF, Start); + Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start); Value *SteppedStart = getStepVector(SplatStart, Zero, Step, II.getInductionOpcode()); @@ -2394,9 +2375,9 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( Type *StepType = Step->getType(); Value *RuntimeVF; if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF); + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); else - RuntimeVF = getRuntimeVF(Builder, StepType, VF); + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); // Create a vector splat to use in the induction update. @@ -2405,8 +2386,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't // handle a constant vector splat. Value *SplatVF = isa<Constant>(Mul) - ? 
ConstantVector::getSplat(VF, cast<Constant>(Mul)) - : Builder.CreateVectorSplat(VF, Mul); + ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -2420,8 +2401,6 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI( if (isa<TruncInst>(EntryVal)) addMetadata(LastInduction, EntryVal); - recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef, - State, Part); LastInduction = cast<Instruction>( Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); @@ -2455,56 +2434,21 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const { return llvm::any_of(IV->users(), isScalarInst); } -void InnerLoopVectorizer::recordVectorLoopValueForInductionCast( - const InductionDescriptor &ID, const Instruction *EntryVal, - Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State, - unsigned Part, unsigned Lane) { - assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) && - "Expected either an induction phi-node or a truncate of it!"); - - // This induction variable is not the phi from the original loop but the - // newly-created IV based on the proof that casted Phi is equal to the - // uncasted Phi in the vectorized loop (under a runtime guard possibly). It - // re-uses the same InductionDescriptor that original IV uses but we don't - // have to do any recording in this case - that is done when original IV is - // processed. - if (isa<TruncInst>(EntryVal)) - return; - - if (!CastDef) { - assert(ID.getCastInsts().empty() && - "there are casts for ID, but no CastDef"); - return; - } - assert(!ID.getCastInsts().empty() && - "there is a CastDef, but no casts for ID"); - // Only the first Cast instruction in the Casts vector is of interest. - // The rest of the Casts (if exist) have no uses outside the - // induction update chain itself. 
- if (Lane < UINT_MAX) - State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane)); - else - State.set(CastDef, VectorLoopVal, Part); -} - -void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, - TruncInst *Trunc, VPValue *Def, - VPValue *CastDef, +void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, + const InductionDescriptor &ID, + Value *Start, TruncInst *Trunc, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; assert((IV->getType()->isIntegerTy() || IV != OldInduction) && "Primary induction variable must have an integer type"); - - auto II = Legal->getInductionVars().find(IV); - assert(II != Legal->getInductionVars().end() && "IV is not an induction"); - - auto ID = II->second; assert(IV->getType() == ID.getStartValue()->getType() && "Types must match"); // The value from the original loop to which we are mapping the new induction // variable. Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV; - auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + auto &DL = EntryVal->getModule()->getDataLayout(); // Generate code for the induction step. Note that induction steps are // required to be loop-invariant @@ -2514,7 +2458,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, if (PSE.getSE()->isSCEVable(IV->getType())) { SCEVExpander Exp(*PSE.getSE(), DL, "induction"); return Exp.expandCodeFor(Step, Step->getType(), - LoopVectorPreHeader->getTerminator()); + State.CFG.VectorPreHeader->getTerminator()); } return cast<SCEVUnknown>(Step)->getValue(); }; @@ -2530,7 +2474,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, ? 
Builder.CreateSExtOrTrunc(Induction, IV->getType()) : Builder.CreateCast(Instruction::SIToFP, Induction, IV->getType()); - ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID); + ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID, + State.CFG.PrevBB); ScalarIV->setName("offset.idx"); } if (Trunc) { @@ -2548,20 +2493,19 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) { Value *Broadcasted = getBroadcastInstrs(ScalarIV); for (unsigned Part = 0; Part < UF; ++Part) { - assert(!VF.isScalable() && "scalable vectors not yet supported."); + assert(!State.VF.isScalable() && "scalable vectors not yet supported."); Value *StartIdx; if (Step->getType()->isFloatingPointTy()) - StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part); + StartIdx = + getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part); else - StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part); + StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part); Value *EntryPart = getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode()); State.set(Def, EntryPart, Part); if (Trunc) addMetadata(EntryPart, Trunc); - recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef, - State, Part); } }; @@ -2572,7 +2516,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // Now do the actual transformations, and start with creating the step value. Value *Step = CreateStepValue(ID.getStep()); - if (VF.isZero() || VF.isScalar()) { + if (State.VF.isZero() || State.VF.isScalar()) { Value *ScalarIV = CreateScalarIV(Step); CreateSplatIV(ScalarIV, Step); return; @@ -2583,8 +2527,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // least one user in the loop that is not widened. 
auto NeedsScalarIV = needsScalarInduction(EntryVal); if (!NeedsScalarIV) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); return; } @@ -2592,14 +2535,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, // create the phi node, we will splat the scalar induction variable in each // loop iteration. if (!shouldScalarizeInstruction(EntryVal)) { - createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef, - State); + createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State); Value *ScalarIV = CreateScalarIV(Step); // Create scalar steps that can be used by instructions we will later // scalarize. Note that the addition of the scalar steps will not increase // the number of instructions in the loop in the common case prior to // InstCombine. We will be trading one vector extract for each scalar step. - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); return; } @@ -2609,7 +2551,7 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start, Value *ScalarIV = CreateScalarIV(Step); if (!Cost->isScalarEpilogueAllowed()) CreateSplatIV(ScalarIV, Step); - buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State); + buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State); } Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, @@ -2663,10 +2605,11 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx, void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal, const InductionDescriptor &ID, - VPValue *Def, VPValue *CastDef, + VPValue *Def, VPTransformState &State) { + IRBuilder<> &Builder = State.Builder; // We shouldn't have to build scalar steps if we aren't vectorizing. 
- assert(VF.isVector() && "VF should be greater than one"); + assert(State.VF.isVector() && "VF should be greater than one"); // Get the value type and ensure it and the step have the same integer type. Type *ScalarIVTy = ScalarIV->getType()->getScalarType(); assert(ScalarIVTy == Step->getType() && @@ -2688,33 +2631,32 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, // iteration. If EntryVal is uniform, we only need to generate the first // lane. Otherwise, we generate all VF values. bool IsUniform = - Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF); - unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue(); + Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF); + unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue(); // Compute the scalar steps and save the results in State. Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(), ScalarIVTy->getScalarSizeInBits()); Type *VecIVTy = nullptr; Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr; - if (!IsUniform && VF.isScalable()) { - VecIVTy = VectorType::get(ScalarIVTy, VF); - UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF)); - SplatStep = Builder.CreateVectorSplat(VF, Step); - SplatIV = Builder.CreateVectorSplat(VF, ScalarIV); + if (!IsUniform && State.VF.isScalable()) { + VecIVTy = VectorType::get(ScalarIVTy, State.VF); + UnitStepVec = + Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF)); + SplatStep = Builder.CreateVectorSplat(State.VF, Step); + SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV); } - for (unsigned Part = 0; Part < UF; ++Part) { - Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part); - if (!IsUniform && VF.isScalable()) { - auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0); + if (!IsUniform && 
State.VF.isScalable()) { + auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0); auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec); if (ScalarIVTy->isFloatingPointTy()) InitVec = Builder.CreateSIToFP(InitVec, VecIVTy); auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep); auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul); State.set(Def, Add, Part); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part); // It's useful to record the lane values too for the known minimum number // of elements so we do those below. This improves the code quality when // trying to extract the first element, for example. @@ -2728,14 +2670,12 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step, AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane)); // The step returned by `createStepForVF` is a runtime-evaluated value // when VF is scalable. Otherwise, it should be folded into a Constant. - assert((VF.isScalable() || isa<Constant>(StartIdx)) && + assert((State.VF.isScalable() || isa<Constant>(StartIdx)) && "Expected StartIdx to be folded to a constant when VF is not " "scalable"); auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step); auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul); State.set(Def, Add, VPIteration(Part, Lane)); - recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State, - Part, Lane); } } } @@ -3023,21 +2963,19 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized // instruction could feed a poison value to the base address of the widen // load/store. 
- if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0) + if (State.MayGeneratePoisonRecipes.contains(RepRecipe)) Cloned->dropPoisonGeneratingFlags(); State.Builder.SetInsertPoint(Builder.GetInsertBlock(), Builder.GetInsertPoint()); // Replace the operands of the cloned instructions with their scalar // equivalents in the new loop. - for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) { - auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op)); + for (auto &I : enumerate(RepRecipe->operands())) { auto InputInstance = Instance; - if (!Operand || !OrigLoop->contains(Operand) || - (Cost->isUniformAfterVectorization(Operand, State.VF))) + VPValue *Operand = I.value(); + if (State.Plan->isUniformAfterVectorization(Operand)) InputInstance.Lane = VPLane::getFirstLane(); - auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance); - Cloned->setOperand(op, NewOp); + Cloned->setOperand(I.index(), State.get(Operand, InputInstance)); } addNewMetadata(Cloned, Instr); @@ -3339,7 +3277,7 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, Value *InnerLoopVectorizer::emitTransformedIndex( IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL, - const InductionDescriptor &ID) const { + const InductionDescriptor &ID, BasicBlock *VectorHeader) const { SCEVExpander Exp(*SE, DL, "induction"); auto Step = ID.getStep(); @@ -3382,15 +3320,15 @@ Value *InnerLoopVectorizer::emitTransformedIndex( }; // Get a suitable insert point for SCEV expansion. For blocks in the vector - // loop, choose the end of the vector loop header (=LoopVectorBody), because + // loop, choose the end of the vector loop header (=VectorHeader), because // the DomTree is not kept up-to-date for additional blocks generated in the // vector loop. By using the header as insertion point, we guarantee that the // expanded instructions dominate all their uses. 
- auto GetInsertPoint = [this, &B]() { + auto GetInsertPoint = [this, &B, VectorHeader]() { BasicBlock *InsertBB = B.GetInsertPoint()->getParent(); if (InsertBB != LoopVectorBody && - LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB)) - return LoopVectorBody->getTerminator(); + LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB)) + return VectorHeader->getTerminator(); return &*B.GetInsertPoint(); }; @@ -3538,7 +3476,8 @@ void InnerLoopVectorizer::createInductionResumeValues( CastInst::getCastOpcode(VectorTripCount, true, StepType, true); Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd"); const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); - EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + EndValue = + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValue->setName("ind.end"); // Compute the end value for the additional bypass (if applicable). @@ -3549,7 +3488,7 @@ void InnerLoopVectorizer::createInductionResumeValues( CRD = B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd"); EndValueFromAdditionalBypass = - emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); + emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody); EndValueFromAdditionalBypass->setName("ind.end"); } } @@ -3623,7 +3562,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L, if (MDNode *LID = OrigLoop->getLoopID()) L->setLoopID(LID); - LoopVectorizeHints Hints(L, true, *ORE); + LoopVectorizeHints Hints(L, true, *ORE, TTI); Hints.setAlreadyVectorized(); #ifdef EXPENSIVE_CHECKS @@ -3780,7 +3719,8 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi, II.getStep()->getType()) : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType()); CMO->setName("cast.cmo"); - Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II); + Value *Escape = + emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody); Escape->setName("ind.escape"); MissingVals[UI] = 
Escape; } @@ -4573,7 +4513,8 @@ void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { } } -bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { +bool InnerLoopVectorizer::useOrderedReductions( + const RecurrenceDescriptor &RdxDesc) { return Cost->useOrderedReductions(RdxDesc); } @@ -4648,8 +4589,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, Value *Idx = Builder.CreateAdd( PartStart, ConstantInt::get(PtrInd->getType(), Lane)); Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx); - Value *SclrGep = - emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II); + Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), + DL, II, State.CFG.PrevBB); SclrGep->setName("next.gep"); State.set(PhiR, SclrGep, VPIteration(Part, Lane)); } @@ -5368,13 +5309,9 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { // Limit MaxScalableVF by the maximum safe dependence distance. Optional<unsigned> MaxVScale = TTI.getMaxVScale(); - if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange) - .getVScaleRangeArgs() - .second; - if (VScaleMax > 0) - MaxVScale = VScaleMax; - } + if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) + MaxVScale = + TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax(); MaxScalableVF = ElementCount::getScalable( MaxVScale ? 
(MaxSafeElements / MaxVScale.getValue()) : 0); if (!MaxScalableVF) @@ -5386,9 +5323,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) { return MaxScalableVF; } -FixedScalableVFPair -LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, - ElementCount UserVF) { +FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( + unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -5475,12 +5411,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount, FixedScalableVFPair Result(ElementCount::getFixed(1), ElementCount::getScalable(0)); - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeFixedVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeFixedVF, FoldTailByMasking)) Result.FixedVF = MaxVF; - if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType, - WidestType, MaxSafeScalableVF)) + if (auto MaxVF = + getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType, + MaxSafeScalableVF, FoldTailByMasking)) if (MaxVF.isScalable()) { Result.ScalableVF = MaxVF; LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF @@ -5513,7 +5451,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { switch (ScalarEpilogueStatus) { case CM_ScalarEpilogueAllowed: - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: LLVM_FALLTHROUGH; case CM_ScalarEpilogueNotNeededUsePredicate: @@ -5551,7 +5489,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); 
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF); + return computeFeasibleMaxVF(TC, UserVF, false); } return FixedScalableVFPair::getNone(); } @@ -5568,7 +5506,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } - FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF); + FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true); // Avoid tail folding if the trip count is known to be a multiple of any VF // we chose. // FIXME: The condition below pessimises the case for fixed-width vectors, @@ -5641,7 +5579,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType, - const ElementCount &MaxSafeVF) { + const ElementCount &MaxSafeVF, bool FoldTailByMasking) { bool ComputeScalableMaxVF = MaxSafeVF.isScalable(); TypeSize WidestRegister = TTI.getRegisterBitWidth( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector @@ -5673,14 +5611,17 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( const auto TripCountEC = ElementCount::getFixed(ConstTripCount); if (ConstTripCount && ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) && - isPowerOf2_32(ConstTripCount)) { - // We need to clamp the VF to be the ConstTripCount. There is no point in - // choosing a higher viable VF as done in the loop below. If - // MaxVectorElementCount is scalable, we only fall back on a fixed VF when - // the TC is less than or equal to the known number of lanes. 
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: " - << ConstTripCount << "\n"); - return TripCountEC; + (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) { + // If loop trip count (TC) is known at compile time there is no point in + // choosing VF greater than TC (as done in the loop below). Select maximum + // power of two which doesn't exceed TC. + // If MaxVectorElementCount is scalable, we only fall back on a fixed VF + // when the TC is less than or equal to the known number of lanes. + auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount); + LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not " + "exceeding the constant trip count: " + << ClampedConstTripCount << "\n"); + return ElementCount::getFixed(ClampedConstTripCount); } ElementCount MaxVF = MaxVectorElementCount; @@ -5758,12 +5699,11 @@ bool LoopVectorizationCostModel::isMoreProfitable( EstimatedWidthB *= VScale.getValue(); } - // When set to preferred, for now assume vscale may be larger than 1 (or the - // one being tuned for), so that scalable vectorization is slightly favorable - // over fixed-width vectorization. - if (Hints->isScalableVectorizationPreferred()) - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. 
+ if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -6068,7 +6008,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { if (auto *PN = dyn_cast<PHINode>(&I)) { if (!Legal->isReductionVariable(PN)) continue; - const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || TTI.preferInLoopReduction(RdxDesc.getOpcode(), RdxDesc.getRecurrenceType(), @@ -7002,7 +6943,7 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( ReductionPhi = InLoopReductionImmediateChains[ReductionPhi]; const RecurrenceDescriptor &RdxDesc = - Legal->getReductionVars()[cast<PHINode>(ReductionPhi)]; + Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second; InstructionCost BaseCost = TTI.getArithmeticReductionCost( RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind); @@ -7079,22 +7020,41 @@ Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost( match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) { if (match(Op0, m_ZExtOrSExt(m_Value())) && Op0->getOpcode() == Op1->getOpcode() && - Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() && !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) { bool IsUnsigned = isa<ZExtInst>(Op0); - auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy); - // Matched reduce(mul(ext, ext)) - InstructionCost ExtCost = - TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType, - TTI::CastContextHint::None, CostKind, Op0); + Type *Op0Ty = Op0->getOperand(0)->getType(); + Type *Op1Ty = Op1->getOperand(0)->getType(); + Type *LargestOpTy = + Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? 
Op1Ty + : Op0Ty; + auto *ExtType = VectorType::get(LargestOpTy, VectorTy); + + // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of + // different sizes. We take the largest type as the ext to reduce, and add + // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))). + InstructionCost ExtCost0 = TTI.getCastInstrCost( + Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op0); + InstructionCost ExtCost1 = TTI.getCastInstrCost( + Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy), + TTI::CastContextHint::None, CostKind, Op1); InstructionCost MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind); InstructionCost RedCost = TTI.getExtendedAddReductionCost( /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType, CostKind); + InstructionCost ExtraExtCost = 0; + if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) { + Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1; + ExtraExtCost = TTI.getCastInstrCost( + ExtraExtOp->getOpcode(), ExtType, + VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy), + TTI::CastContextHint::None, CostKind, ExtraExtOp); + } - if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost) + if (RedCost.isValid() && + (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost)) return I == RetI ? 
RedCost : 0; } else if (!match(I, m_ZExtOrSExt(m_Value()))) { // Matched reduce(mul()) @@ -7570,8 +7530,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, Type *CondTy = SI->getCondition()->getType(); if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); - return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + + CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition())) + Pred = Cmp->getPredicate(); + return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred, + CostKind, I); } case Instruction::ICmp: case Instruction::FCmp: { @@ -7581,7 +7545,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, - CmpInst::BAD_ICMP_PREDICATE, CostKind, I); + cast<CmpInst>(I)->getPredicate(), CostKind, + I); } case Instruction::Store: case Instruction::Load: { @@ -7762,14 +7727,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { // Ignore type-promoting instructions we identified during reduction // detection. for (auto &Reduction : Legal->getReductionVars()) { - RecurrenceDescriptor &RedDes = Reduction.second; + const RecurrenceDescriptor &RedDes = Reduction.second; const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } // Ignore type-casting instructions we identified during induction // detection. 
for (auto &Induction : Legal->getInductionVars()) { - InductionDescriptor &IndDes = Induction.second; + const InductionDescriptor &IndDes = Induction.second; const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); VecValuesToIgnore.insert(Casts.begin(), Casts.end()); } @@ -7778,7 +7743,7 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { void LoopVectorizationCostModel::collectInLoopReductions() { for (auto &Reduction : Legal->getReductionVars()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Reduction.second; + const RecurrenceDescriptor &RdxDesc = Reduction.second; // We don't collect reductions that are type promoted (yet). if (RdxDesc.getRecurrenceType() != Phi->getType()) @@ -8064,18 +8029,6 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions( return U == Ind || DeadInstructions.count(cast<Instruction>(U)); })) DeadInstructions.insert(IndUpdate); - - // We record as "Dead" also the type-casting instructions we had identified - // during induction analysis. We don't need any handling for them in the - // vectorized loop because we have proven that, under a proper runtime - // test guarding the vectorized loop, the value of the phi, and the casted - // value of the phi, are the same. The last instruction in this casting chain - // will get its scalar/vector/widened def from the scalar/vector/widened def - // of the respective phi node. Any other casts in the induction def-use chain - // have no other uses outside the phi update chain, and will be ignored. 
- InductionDescriptor &IndDes = Induction.second; - const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts(); - DeadInstructions.insert(Casts.begin(), Casts.end()); } } @@ -8461,7 +8414,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, assert(EdgeMask && "No Edge Mask found for condition"); if (BI->getSuccessor(0) != Dst) - EdgeMask = Builder.createNot(EdgeMask); + EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc()); if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND. // The condition is 'SrcMask && EdgeMask', which is equivalent to @@ -8470,7 +8423,8 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, // EdgeMask is poison. Using 'and' here introduces undefined behavior. VPValue *False = Plan->getOrAddVPValue( ConstantInt::getFalse(BI->getCondition()->getType())); - EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False); + EdgeMask = + Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc()); } return EdgeMaskCache[Edge] = EdgeMask; @@ -8492,22 +8446,24 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - // Create the block in mask as the first non-phi instruction in the block. - VPBuilder::InsertPointGuard Guard(Builder); - auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); - Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); - // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. - // Start by constructing the desired canonical IV. + // Start by constructing the desired canonical IV in the header block. 
VPValue *IV = nullptr; if (Legal->getPrimaryInduction()) IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); else { + VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock(); auto *IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); + HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi()); IV = IVRecipe; } + + // Create the block in mask as the first non-phi instruction in the block. + VPBuilder::InsertPointGuard Guard(Builder); + auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); + Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint); + VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8534,7 +8490,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { continue; } - BlockMask = Builder.createOr(BlockMask, EdgeMask); + BlockMask = Builder.createOr(BlockMask, EdgeMask, {}); } return BlockMaskCache[BB] = BlockMask; @@ -8591,14 +8547,10 @@ VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef<VPValue *> Operands) const { // Check if this is an integer or fp induction. If so, build the recipe that // produces its scalar and vector values. - InductionDescriptor II = Legal->getInductionVars().lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - assert(II.getStartValue() == + if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) { + assert(II->getStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); - const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts(); - return new VPWidenIntOrFpInductionRecipe( - Phi, Operands[0], Casts.empty() ? 
nullptr : Casts.front()); + return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II); } return nullptr; @@ -8624,11 +8576,10 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate( if (LoopVectorizationPlanner::getDecisionAndClampRange( isOptimizableIVTruncate(I), Range)) { - InductionDescriptor II = - Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0))); + auto *Phi = cast<PHINode>(I->getOperand(0)); + const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi); VPValue *Start = Plan.getOrAddVPValue(II.getStartValue()); - return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)), - Start, nullptr, I); + return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I); } return nullptr; } @@ -8844,13 +8795,17 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( return VPBB; } LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n"); - assert(VPBB->getSuccessors().empty() && - "VPBB has successors when handling predicated replication."); + + VPBlockBase *SingleSucc = VPBB->getSingleSuccessor(); + assert(SingleSucc && "VPBB must have a single successor when handling " + "predicated replication."); + VPBlockUtils::disconnectBlocks(VPBB, SingleSucc); // Record predicated instructions for above packing optimizations. 
VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan); VPBlockUtils::insertBlockAfter(Region, VPBB); auto *RegSucc = new VPBasicBlock(); VPBlockUtils::insertBlockAfter(RegSucc, Region); + VPBlockUtils::connectBlocks(RegSucc, SingleSucc); return RegSucc; } @@ -8910,7 +8865,8 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) { VPValue *StartV = Operands[0]; if (Legal->isReductionVariable(Phi)) { - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; assert(RdxDesc.getRecurrenceStartValue() == Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())); PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV, @@ -9031,7 +8987,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( } for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind(); + RecurKind Kind = + Legal->getReductionVars().find(Phi)->second.getRecurrenceKind(); const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; RecipeBuilder.recordRecipeOf(Phi); @@ -9069,30 +9026,25 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( // visit each basic block after having visited its predecessor basic blocks. // --------------------------------------------------------------------------- - auto Plan = std::make_unique<VPlan>(); + // Create initial VPlan skeleton, with separate header and latch blocks. 
+ VPBasicBlock *HeaderVPBB = new VPBasicBlock(); + VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); + VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB); + auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop"); + auto Plan = std::make_unique<VPlan>(TopRegion); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. LoopBlocksDFS DFS(OrigLoop); DFS.perform(LI); - VPBasicBlock *VPBB = nullptr; - VPBasicBlock *HeaderVPBB = nullptr; + VPBasicBlock *VPBB = HeaderVPBB; SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove; for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. unsigned VPBBsForBB = 0; - auto *FirstVPBBForBB = new VPBasicBlock(BB->getName()); - if (VPBB) - VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB); - else { - auto *TopRegion = new VPRegionBlock("vector loop"); - TopRegion->setEntry(FirstVPBBForBB); - Plan->setEntry(TopRegion); - HeaderVPBB = FirstVPBBForBB; - } - VPBB = FirstVPBBForBB; + VPBB->setName(BB->getName()); Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. @@ -9159,13 +9111,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( : ""); } } + + VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); + VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor()); } + // Fold the last, empty block into its predecessor. + VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB); + assert(VPBB && "expected to fold last (empty) block"); + // After here, VPBB should not be used. 
+ VPBB = nullptr; + assert(isa<VPRegionBlock>(Plan->getEntry()) && !Plan->getEntry()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " "VPBasicBlock"); - cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB); RecipeBuilder.fixHeaderPhis(); // --------------------------------------------------------------------------- @@ -9231,18 +9191,19 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock); VPBlockUtils::connectBlocks(SplitPred, SinkRegion); VPBlockUtils::connectBlocks(SinkRegion, SplitBlock); - if (VPBB == SplitPred) - VPBB = SplitBlock; } } + VPlanTransforms::removeRedundantInductionCasts(*Plan); + // Now that sink-after is done, move induction recipes for optimized truncates // to the phi section of the header block. for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove) Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start); + adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan, + RecipeBuilder, Range.Start); // Introduce a recipe to combine the incoming and previous values of a // first-order recurrence. @@ -9322,6 +9283,11 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( RSO.flush(); Plan->setName(PlanName); + // Fold Exit block into its predecessor if possible. + // TODO: Fold block earlier once all VPlan transforms properly maintain a + // VPBasicBlock as exit. 
+ VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit()); + assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid"); return Plan; } @@ -9355,9 +9321,10 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, - Legal->getInductionVars(), - DeadInstructions, *PSE.getSE()); + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, + [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); }, + DeadInstructions, *PSE.getSE()); return Plan; } @@ -9371,7 +9338,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( ElementCount MinVF) { for (auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; - RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi]; + const RecurrenceDescriptor &RdxDesc = + Legal->getReductionVars().find(Phi)->second; const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second; if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc)) @@ -9565,7 +9533,7 @@ void VPWidenRecipe::execute(VPTransformState &State) { // exact, etc.). The control flow has been linearized and the // instruction is no longer guarded by the predicate, which could make // the flag properties to no longer hold. 
- if (State.MayGeneratePoisonRecipes.count(this) > 0) + if (State.MayGeneratePoisonRecipes.contains(this)) VecOp->dropPoisonGeneratingFlags(); } @@ -9714,9 +9682,9 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) { void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); - State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(), - getTruncInst(), getVPValue(0), - getCastValue(), State); + State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(), + getStartValue()->getLiveInIRValue(), + getTruncInst(), getVPValue(0), State); } void VPWidenPHIRecipe::execute(VPTransformState &State) { @@ -10293,7 +10261,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { << L->getHeader()->getParent()->getName() << "\" from " << DebugLocStr << "\n"); - LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE); + LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI); LLVM_DEBUG( dbgs() << "LV: Loop hints:" @@ -10747,8 +10715,17 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, PA.preserve<LoopAnalysis>(); PA.preserve<DominatorTreeAnalysis>(); } - if (!Result.MadeCFGChange) + + if (Result.MadeCFGChange) { + // Making CFG changes likely means a loop got vectorized. Indicate that + // extra simplification passes should be run. + // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only + // be run if runtime checks have been added. 
+ AM.getResult<ShouldRunExtraVectorPasses>(F); + PA.preserve<ShouldRunExtraVectorPasses>(); + } else { PA.preserveSet<CFGAnalyses>(); + } return PA; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 95061e9053fa..37ae13666f7a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -631,27 +631,26 @@ static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { /// after: 6 3 5 4 7 2 1 0 static void fixupOrderingIndices(SmallVectorImpl<unsigned> &Order) { const unsigned Sz = Order.size(); - SmallBitVector UsedIndices(Sz); - SmallVector<int> MaskedIndices; + SmallBitVector UnusedIndices(Sz, /*t=*/true); + SmallBitVector MaskedIndices(Sz); for (unsigned I = 0; I < Sz; ++I) { if (Order[I] < Sz) - UsedIndices.set(Order[I]); + UnusedIndices.reset(Order[I]); else - MaskedIndices.push_back(I); + MaskedIndices.set(I); } - if (MaskedIndices.empty()) + if (MaskedIndices.none()) return; - SmallVector<int> AvailableIndices(MaskedIndices.size()); - unsigned Cnt = 0; - int Idx = UsedIndices.find_first(); - do { - AvailableIndices[Cnt] = Idx; - Idx = UsedIndices.find_next(Idx); - ++Cnt; - } while (Idx > 0); - assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); - for (int I = 0, E = MaskedIndices.size(); I < E; ++I) - Order[MaskedIndices[I]] = AvailableIndices[I]; + assert(UnusedIndices.count() == MaskedIndices.count() && + "Non-synced masked/available indices."); + int Idx = UnusedIndices.find_first(); + int MIdx = MaskedIndices.find_first(); + while (MIdx >= 0) { + assert(Idx >= 0 && "Indices must be synced."); + Order[MIdx] = Idx; + Idx = UnusedIndices.find_next(Idx); + MIdx = MaskedIndices.find_next(MIdx); + } } namespace llvm { @@ -812,6 +811,13 @@ public: /// ExtractElement, ExtractValue), which can be part of the graph. 
Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE); + /// Gets reordering data for the given tree entry. If the entry is vectorized + /// - just return ReorderIndices, otherwise check if the scalars can be + /// reordered and return the most optimal order. + /// \param TopToBottom If true, include the order of vectorized stores and + /// insertelement nodes, otherwise skip them. + Optional<OrdersType> getReorderingData(const TreeEntry &TE, bool TopToBottom); + /// Reorders the current graph to the most profitable order starting from the /// root node to the leaf nodes. The best order is chosen only from the nodes /// of the same size (vectorization factor). Smaller nodes are considered @@ -1010,18 +1016,25 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } - // The hard-coded scores listed here are not very important. When computing - // the scores of matching one sub-tree with another, we are basically - // counting the number of values that are matching. So even if all scores - // are set to 1, we would still get a decent matching result. + // The hard-coded scores listed here are not very important, though it shall + // be higher for better matches to improve the resulting cost. When + // computing the scores of matching one sub-tree with another, we are + // basically counting the number of values that are matching. So even if all + // scores are set to 1, we would still get a decent matching result. // However, sometimes we have to break ties. For example we may have to // choose between matching loads vs matching opcodes. This is what these - // scores are helping us with: they provide the order of preference. + // scores are helping us with: they provide the order of preference. Also, + // this is important if the scalar is externally used or used in another + // tree entry node in the different lane. /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 
- static const int ScoreConsecutiveLoads = 3; + static const int ScoreConsecutiveLoads = 4; + /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). + static const int ScoreReversedLoads = 3; /// ExtractElementInst from same vector and consecutive indexes. - static const int ScoreConsecutiveExtracts = 3; + static const int ScoreConsecutiveExtracts = 4; + /// ExtractElementInst from same vector and reversed indices. + static const int ScoreReversedExtracts = 3; /// Constants. static const int ScoreConstants = 2; /// Instructions with the same opcode. @@ -1041,7 +1054,10 @@ public: /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, - ScalarEvolution &SE) { + ScalarEvolution &SE, int NumLanes) { + if (V1 == V2) + return VLOperands::ScoreSplat; + auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); if (LI1 && LI2) { @@ -1051,8 +1067,17 @@ public: Optional<int> Dist = getPointersDiff( LI1->getType(), LI1->getPointerOperand(), LI2->getType(), LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); - return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (!Dist) + return VLOperands::ScoreFail; + // The distance is too large - still may be profitable to use masked + // loads/gathers. + if (std::abs(*Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + // This still will detect consecutive loads, but we might have "holes" + // in some cases. It is ok for non-power-2 vectorization and may produce + // better results. It should not affect current vectorization. + return (*Dist > 0) ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreReversedLoads; } auto *C1 = dyn_cast<Constant>(V1); @@ -1062,18 +1087,41 @@ public: // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. 
- Value *EV; - ConstantInt *Ex1Idx, *Ex2Idx; - if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) && - match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) && - Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue()) - return VLOperands::ScoreConsecutiveExtracts; + Value *EV1; + ConstantInt *Ex1Idx; + if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) { + // Undefs are always profitable for extractelements. + if (isa<UndefValue>(V2)) + return VLOperands::ScoreConsecutiveExtracts; + Value *EV2 = nullptr; + ConstantInt *Ex2Idx = nullptr; + if (match(V2, + m_ExtractElt(m_Value(EV2), m_CombineOr(m_ConstantInt(Ex2Idx), + m_Undef())))) { + // Undefs are always profitable for extractelements. + if (!Ex2Idx) + return VLOperands::ScoreConsecutiveExtracts; + if (isUndefVector(EV2) && EV2->getType() == EV1->getType()) + return VLOperands::ScoreConsecutiveExtracts; + if (EV2 == EV1) { + int Idx1 = Ex1Idx->getZExtValue(); + int Idx2 = Ex2Idx->getZExtValue(); + int Dist = Idx2 - Idx1; + // The distance is too large - still may be profitable to use + // shuffles. + if (std::abs(Dist) > NumLanes / 2) + return VLOperands::ScoreAltOpcodes; + return (Dist > 0) ? VLOperands::ScoreConsecutiveExtracts + : VLOperands::ScoreReversedExtracts; + } + } + } auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (I1 && I2) { - if (I1 == I2) - return VLOperands::ScoreSplat; + if (I1->getParent() != I2->getParent()) + return VLOperands::ScoreFail; InstructionsState S = getSameOpcode({I1, I2}); // Note: Only consider instructions with <= 2 operands to avoid // complexity explosion. @@ -1088,11 +1136,13 @@ public: return VLOperands::ScoreFail; } - /// Holds the values and their lane that are taking part in the look-ahead + /// Holds the values and their lanes that are taking part in the look-ahead /// score calculation. This is used in the external uses cost calculation. 
- SmallDenseMap<Value *, int> InLookAheadValues; + /// Need to hold all the lanes in case of splat/broadcast at least to + /// correctly check for the use in the different lane. + SmallDenseMap<Value *, SmallSet<int, 4>> InLookAheadValues; - /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are + /// \returns the additional cost due to uses of \p LHS and \p RHS that are /// either external to the vectorized code, or require shuffling. int getExternalUsesCost(const std::pair<Value *, int> &LHS, const std::pair<Value *, int> &RHS) { @@ -1116,22 +1166,30 @@ public: for (User *U : V->users()) { if (const TreeEntry *UserTE = R.getTreeEntry(U)) { // The user is in the VectorizableTree. Check if we need to insert. - auto It = llvm::find(UserTE->Scalars, U); - assert(It != UserTE->Scalars.end() && "U is in UserTE"); - int UserLn = std::distance(UserTE->Scalars.begin(), It); + int UserLn = UserTE->findLaneForValue(U); assert(UserLn >= 0 && "Bad lane"); - if (UserLn != Ln) + // If the values are different, check just the line of the current + // value. If the values are the same, need to add UserInDiffLaneCost + // only if UserLn does not match both line numbers. + if ((LHS.first != RHS.first && UserLn != Ln) || + (LHS.first == RHS.first && UserLn != LHS.second && + UserLn != RHS.second)) { Cost += UserInDiffLaneCost; + break; + } } else { // Check if the user is in the look-ahead code. auto It2 = InLookAheadValues.find(U); if (It2 != InLookAheadValues.end()) { // The user is in the look-ahead code. Check the lane. - if (It2->second != Ln) + if (!It2->getSecond().contains(Ln)) { Cost += UserInDiffLaneCost; + break; + } } else { // The user is neither in SLP tree nor in the look-ahead code. Cost += ExternalUseCost; + break; } } // Limit the number of visited uses to cap compilation time. @@ -1170,32 +1228,36 @@ public: Value *V1 = LHS.first; Value *V2 = RHS.first; // Get the shallow score of V1 and V2. 
- int ShallowScoreAtThisLevel = - std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - - getExternalUsesCost(LHS, RHS)); + int ShallowScoreAtThisLevel = std::max( + (int)ScoreFail, getShallowScore(V1, V2, DL, SE, getNumLanes()) - + getExternalUsesCost(LHS, RHS)); int Lane1 = LHS.second; int Lane2 = RHS.second; // If reached MaxLevel, // or if V1 and V2 are not instructions, // or if they are SPLAT, - // or if they are not consecutive, early return the current cost. + // or if they are not consecutive, + // or if profitable to vectorize loads or extractelements, early return + // the current cost. auto *I1 = dyn_cast<Instruction>(V1); auto *I2 = dyn_cast<Instruction>(V2); if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || ShallowScoreAtThisLevel == VLOperands::ScoreFail || - (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) || + (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) && + ShallowScoreAtThisLevel)) return ShallowScoreAtThisLevel; assert(I1 && I2 && "Should have early exited."); // Keep track of in-tree values for determining the external-use cost. - InLookAheadValues[V1] = Lane1; - InLookAheadValues[V2] = Lane2; + InLookAheadValues[V1].insert(Lane1); + InLookAheadValues[V2].insert(Lane2); // Contains the I2 operand indexes that got matched with I1 operands. SmallSet<unsigned, 4> Op2Used; - // Recursion towards the operands of I1 and I2. We are trying all possbile + // Recursion towards the operands of I1 and I2. We are trying all possible // operand pairs, and keeping track of the best score. for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); OpIdx1 != NumOperands1; ++OpIdx1) { @@ -1319,27 +1381,79 @@ public: return None; } - /// Helper for reorderOperandVecs. \Returns the lane that we should start - /// reordering from. This is the one which has the least number of operands - /// that can freely move about. + /// Helper for reorderOperandVecs. 
+ /// \returns the lane that we should start reordering from. This is the one + /// which has the least number of operands that can freely move about or + /// less profitable because it already has the most optimal set of operands. unsigned getBestLaneToStartReordering() const { - unsigned BestLane = 0; unsigned Min = UINT_MAX; - for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; - ++Lane) { - unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane); - if (NumFreeOps < Min) { - Min = NumFreeOps; - BestLane = Lane; + unsigned SameOpNumber = 0; + // std::pair<unsigned, unsigned> is used to implement a simple voting + // algorithm and choose the lane with the least number of operands that + // can freely move about or less profitable because it already has the + // most optimal set of operands. The first unsigned is a counter for + // voting, the second unsigned is the counter of lanes with instructions + // with same/alternate opcodes and same parent basic block. + MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; + // Try to be closer to the original results, if we have multiple lanes + // with same cost. If 2 lanes have the same cost, use the one with the + // lowest index. + for (int I = getNumLanes(); I > 0; --I) { + unsigned Lane = I - 1; + OperandsOrderData NumFreeOpsHash = + getMaxNumOperandsThatCanBeReordered(Lane); + // Compare the number of operands that can move and choose the one with + // the least number. + if (NumFreeOpsHash.NumOfAPOs < Min) { + Min = NumFreeOpsHash.NumOfAPOs; + SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap.clear(); + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { + // Select the most optimal lane in terms of number of operands that + // should be moved around. 
+ SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; + HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane); + } else if (NumFreeOpsHash.NumOfAPOs == Min && + NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { + ++HashMap[NumFreeOpsHash.Hash].first; + } + } + // Select the lane with the minimum counter. + unsigned BestLane = 0; + unsigned CntMin = UINT_MAX; + for (const auto &Data : reverse(HashMap)) { + if (Data.second.first < CntMin) { + CntMin = Data.second.first; + BestLane = Data.second.second; } } return BestLane; } - /// \Returns the maximum number of operands that are allowed to be reordered - /// for \p Lane. This is used as a heuristic for selecting the first lane to - /// start operand reordering. - unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { + /// Data structure that helps to reorder operands. + struct OperandsOrderData { + /// The best number of operands with the same APOs, which can be + /// reordered. + unsigned NumOfAPOs = UINT_MAX; + /// Number of operands with the same/alternate instruction opcode and + /// parent. + unsigned NumOpsWithSameOpcodeParent = 0; + /// Hash for the actual operands ordering. + /// Used to count operands, actually their position id and opcode + /// value. It is used in the voting mechanism to find the lane with the + /// least number of operands that can freely move about or less profitable + /// because it already has the most optimal set of operands. Can be + /// replaced with SmallVector<unsigned> instead but hash code is faster + /// and requires less memory. + unsigned Hash = 0; + }; + /// \returns the maximum number of operands that are allowed to be reordered + /// for \p Lane and the number of compatible instructions(with the same + /// parent/opcode). This is used as a heuristic for selecting the first lane + /// to start operand reordering. 
+ OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { unsigned CntTrue = 0; unsigned NumOperands = getNumOperands(); // Operands with the same APO can be reordered. We therefore need to count @@ -1348,11 +1462,45 @@ public: // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + // Operands with the same instruction opcode and parent are more + // profitable since we don't need to move them in many cases, with a high + // probability such lane already can be vectorized effectively. + bool AllUndefs = true; + unsigned NumOpsWithSameOpcodeParent = 0; + Instruction *OpcodeI = nullptr; + BasicBlock *Parent = nullptr; + unsigned Hash = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; - return std::max(CntTrue, CntFalse); + // Use Boyer-Moore majority voting for finding the majority opcode and + // the number of times it occurs. 
+ if (auto *I = dyn_cast<Instruction>(OpData.V)) { + if (!OpcodeI || !getSameOpcode({OpcodeI, I}).getOpcode() || + I->getParent() != Parent) { + if (NumOpsWithSameOpcodeParent == 0) { + NumOpsWithSameOpcodeParent = 1; + OpcodeI = I; + Parent = I->getParent(); + } else { + --NumOpsWithSameOpcodeParent; + } + } else { + ++NumOpsWithSameOpcodeParent; + } + } + Hash = hash_combine( + Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1))); + AllUndefs = AllUndefs && isa<UndefValue>(OpData.V); + } + if (AllUndefs) + return {}; + OperandsOrderData Data; + Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue); + Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; + Data.Hash = Hash; + return Data; } /// Go through the instructions in VL and append their operands. @@ -1500,11 +1648,37 @@ public: ReorderingModes[OpIdx] = ReorderingMode::Failed; } + // Check that we don't have same operands. No need to reorder if operands + // are just perfect diamond or shuffled diamond match. Do not do it only + // for possible broadcasts or non-power of 2 number of scalars (just for + // now). + auto &&SkipReordering = [this]() { + SmallPtrSet<Value *, 4> UniqueValues; + ArrayRef<OperandData> Op0 = OpsVec.front(); + for (const OperandData &Data : Op0) + UniqueValues.insert(Data.V); + for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) { + if (any_of(Op, [&UniqueValues](const OperandData &Data) { + return !UniqueValues.contains(Data.V); + })) + return false; + } + // TODO: Check if we can remove a check for non-power-2 number of + // scalars after full support of non-power-2 vectorization. + return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size()); + }; + // If the initial strategy fails for any of the operand indexes, then we // perform reordering again in a second pass. This helps avoid assigning // high priority to the failed strategy, and should improve reordering for // the non-failed operand indexes. 
for (int Pass = 0; Pass != 2; ++Pass) { + // Check if no need to reorder operands since they're are perfect or + // shuffled diamond match. + // Need to to do it to avoid extra external use cost counting for + // shuffled matches, which may cause regressions. + if (SkipReordering()) + break; // Skip the second pass if the first pass did not fail. bool StrategyFailed = false; // Mark all operand data as free to use. @@ -1792,9 +1966,10 @@ private: if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].empty() && "Already resized?"); - Operands[OpIdx].resize(Scalars.size()); - for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = OpVL[Lane]; + assert(OpVL.size() <= Scalars.size() && + "Number of operands is greater than the number of scalars."); + Operands[OpIdx].resize(OpVL.size()); + copy(OpVL, Operands[OpIdx].begin()); } /// Set the operands of this bundle in their original order. @@ -1944,7 +2119,7 @@ private: if (ReuseShuffleIndices.empty()) dbgs() << "Empty"; else - for (unsigned ReuseIdx : ReuseShuffleIndices) + for (int ReuseIdx : ReuseShuffleIndices) dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; @@ -2819,6 +2994,50 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) { return None; } +Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE, + bool TopToBottom) { + // No need to reorder if need to shuffle reuses, still need to shuffle the + // node. + if (!TE.ReuseShuffleIndices.empty()) + return None; + if (TE.State == TreeEntry::Vectorize && + (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) || + (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) && + !TE.isAltShuffle()) + return TE.ReorderIndices; + if (TE.State == TreeEntry::NeedToGather) { + // TODO: add analysis of other gather nodes with extractelement + // instructions and other values/instructions, not only undefs. 
+ if (((TE.getOpcode() == Instruction::ExtractElement && + !TE.isAltShuffle()) || + (all_of(TE.Scalars, + [](Value *V) { + return isa<UndefValue, ExtractElementInst>(V); + }) && + any_of(TE.Scalars, + [](Value *V) { return isa<ExtractElementInst>(V); }))) && + all_of(TE.Scalars, + [](Value *V) { + auto *EE = dyn_cast<ExtractElementInst>(V); + return !EE || isa<FixedVectorType>(EE->getVectorOperandType()); + }) && + allSameType(TE.Scalars)) { + // Check that gather of extractelements can be represented as + // just a shuffle of a single vector. + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder); + if (Reuse || !CurrentOrder.empty()) { + if (!CurrentOrder.empty()) + fixupOrderingIndices(CurrentOrder); + return CurrentOrder; + } + } + if (Optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE)) + return CurrentOrder; + } + return None; +} + void BoUpSLP::reorderTopToBottom() { // Maps VF to the graph nodes. DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries; @@ -2826,42 +3045,15 @@ void BoUpSLP::reorderTopToBottom() { // their ordering. DenseMap<const TreeEntry *, OrdersType> GathersToOrders; // Find all reorderable nodes with the given VF. - // Currently the are vectorized loads,extracts + some gathering of extracts. + // Currently the are vectorized stores,loads,extracts + some gathering of + // extracts. for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders]( const std::unique_ptr<TreeEntry> &TE) { - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst, - InsertElementInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/true)) { VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if (Reuse || !CurrentOrder.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -2993,44 +3185,11 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { const std::unique_ptr<TreeEntry> &TE) { if (TE->State != TreeEntry::Vectorize) NonVectorized.push_back(TE.get()); - // No need to reorder if need to shuffle reuses, still need to shuffle the - // node. 
- if (!TE->ReuseShuffleIndices.empty()) - return; - if (TE->State == TreeEntry::Vectorize && - isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE->getMainOp()) && - !TE->isAltShuffle()) { + if (Optional<OrdersType> CurrentOrder = + getReorderingData(*TE.get(), /*TopToBottom=*/false)) { OrderedEntries.insert(TE.get()); - return; - } - if (TE->State == TreeEntry::NeedToGather) { - if (TE->getOpcode() == Instruction::ExtractElement && - !TE->isAltShuffle() && - isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp()) - ->getVectorOperandType()) && - allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) { - // Check that gather of extractelements can be represented as - // just a shuffle of a single vector with a single user only. - OrdersType CurrentOrder; - bool Reuse = - canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder); - if ((Reuse || !CurrentOrder.empty()) && - !any_of(VectorizableTree, - [&TE](const std::unique_ptr<TreeEntry> &Entry) { - return Entry->State == TreeEntry::NeedToGather && - Entry.get() != TE.get() && - Entry->isSame(TE->Scalars); - })) { - OrderedEntries.insert(TE.get()); - GathersToOrders.try_emplace(TE.get(), CurrentOrder); - return; - } - } - if (Optional<OrdersType> CurrentOrder = - findReusedOrderedScalars(*TE.get())) { - OrderedEntries.insert(TE.get()); + if (TE->State != TreeEntry::Vectorize) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); - } } }); @@ -3392,9 +3551,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Check that every instruction appears once in this bundle. DenseMap<Value *, unsigned> UniquePositions; for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back( + isa<UndefValue>(V) ? UndefMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndicies.emplace_back(isa<UndefValue>(V) ? 
-1 - : Res.first->second); + ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) UniqueValues.emplace_back(V); } @@ -3404,6 +3568,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (NumUniqueScalarValues <= 1 || + (UniquePositions.size() == 1 && all_of(UniqueValues, + [](Value *V) { + return isa<UndefValue>(V) || + !isConstant(V); + })) || !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -3508,11 +3677,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } } - // If any of the scalars is marked as a value that needs to stay scalar, then - // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, @@ -4219,10 +4386,17 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, SmallVectorImpl<unsigned> &CurrentOrder) const { - Instruction *E0 = cast<Instruction>(OpValue); - assert(E0->getOpcode() == Instruction::ExtractElement || - E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + const auto *It = find_if(VL, [](Value *V) { + return isa<ExtractElementInst, ExtractValueInst>(V); + }); + assert(It != VL.end() && "Expected at least one extract instruction."); + auto *E0 = cast<Instruction>(*It); + assert(all_of(VL, + [](Value *V) { + return isa<UndefValue, ExtractElementInst, ExtractValueInst>( + V); + }) && + "Invalid opcode"); // Check if 
all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -4255,23 +4429,28 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, // Also, later we can check that all the indices are used and we have a // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. - CurrentOrder.assign(E, E + 1); + CurrentOrder.assign(E, E); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast<Instruction>(VL[I]); + auto *Inst = dyn_cast<Instruction>(VL[I]); + if (!Inst) + continue; if (Inst->getOperand(0) != Vec) break; + if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) + if (isa<UndefValue>(EE->getIndexOperand())) + continue; Optional<unsigned> Idx = getExtractIndex(Inst); if (!Idx) break; const unsigned ExtIdx = *Idx; if (ExtIdx != I) { - if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) + if (ExtIdx >= E || CurrentOrder[ExtIdx] != E) break; ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { - if (CurrentOrder[I] != E + 1) + if (CurrentOrder[I] != E) break; CurrentOrder[I] = I; } @@ -4287,8 +4466,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, bool BoUpSLP::areAllUsersVectorized(Instruction *I, ArrayRef<Value *> VectorizedVals) const { return (I->hasOneUse() && is_contained(VectorizedVals, I)) || - llvm::all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + all_of(I->users(), [this](User *U) { + return ScalarToTreeEntry.count(U) > 0 || MustGather.contains(U); }); } @@ -4348,6 +4527,10 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Reached the start of a new vector registers. 
if (Idx % EltsPerVector == 0) { AllConsecutive = true; @@ -4357,9 +4540,11 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); - unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); - AllConsecutive &= PrevIdx + 1 == CurrentIdx && - CurrentIdx % EltsPerVector == Idx % EltsPerVector; + if (!isa<UndefValue>(VL[Idx - 1]) && Mask[Idx - 1] != UndefMaskElem) { + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + } if (AllConsecutive) continue; @@ -4442,9 +4627,9 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // FIXME: it tries to fix a problem with MSVC buildbots. TargetTransformInfo &TTIRef = *TTI; auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, - VectorizedVals](InstructionCost &Cost, - bool IsGather) { + VectorizedVals, E](InstructionCost &Cost) { DenseMap<Value *, int> ExtractVectorsTys; + SmallPtrSet<Value *, 4> CheckedExtracts; for (auto *V : VL) { if (isa<UndefValue>(V)) continue; @@ -4452,7 +4637,12 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals)) + // Also, avoid adjusting the cost for extractelements with multiple uses + // in different graph entries. 
+ const TreeEntry *VE = getTreeEntry(V); + if (!CheckedExtracts.insert(V).second || + !areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (VE && VE != E)) continue; auto *EE = cast<ExtractElementInst>(V); Optional<unsigned> EEIdx = getExtractIndex(EE); @@ -4549,11 +4739,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, } return GatherCost; } - if (isSplat(VL)) { - // Found the broadcasting of the single scalar, calculate the cost as the - // broadcast. - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); - } if ((E->getOpcode() == Instruction::ExtractElement || all_of(E->Scalars, [](Value *V) { @@ -4571,13 +4756,20 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, // single input vector or of 2 input vectors. InstructionCost Cost = computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); - AdjustExtractsCost(Cost, /*IsGather=*/true); + AdjustExtractsCost(Cost); if (NeedToShuffleReuses) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return Cost; } } + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. + assert(VecTy == FinalVecTy && + "No reused scalars expected for broadcast."); + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } InstructionCost ReuseShuffleCost = 0; if (NeedToShuffleReuses) ReuseShuffleCost = TTI->getShuffleCost( @@ -4755,7 +4947,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } else { - AdjustExtractsCost(CommonCost, /*IsGather=*/false); + AdjustExtractsCost(CommonCost); } return CommonCost; } @@ -5211,15 +5403,15 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, FoundOr = true; } // Check if the input is an extended load of the required or/shift expression. 
- Value *LoadPtr; + Value *Load; if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || - !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load)) return false; // Require that the total load bit width is a legal integer type. // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. - Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + Type *SrcTy = Load->getType(); unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth))) return false; @@ -9061,8 +9253,7 @@ private: "A call to the llvm.fmuladd intrinsic is not handled yet"); ++NumVectorInstructions; - return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind, - ReductionOps.back()); + return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind); } }; @@ -9473,6 +9664,59 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming, return Changed; } +/// Compare two cmp instructions. If IsCompatibility is true, function returns +/// true if 2 cmps have same/swapped predicates and mos compatible corresponding +/// operands. If IsCompatibility is false, function implements strict weak +/// ordering relation between two cmp instructions, returning true if the first +/// instruction is "less" than the second, i.e. its predicate is less than the +/// predicate of the second or the operands IDs are less than the operands IDs +/// of the second cmp instruction. 
+template <bool IsCompatibility> +static bool compareCmp(Value *V, Value *V2, + function_ref<bool(Instruction *)> IsDeleted) { + auto *CI1 = cast<CmpInst>(V); + auto *CI2 = cast<CmpInst>(V2); + if (IsDeleted(CI2) || !isValidElementType(CI2->getType())) + return false; + if (CI1->getOperand(0)->getType()->getTypeID() < + CI2->getOperand(0)->getType()->getTypeID()) + return !IsCompatibility; + if (CI1->getOperand(0)->getType()->getTypeID() > + CI2->getOperand(0)->getType()->getTypeID()) + return false; + CmpInst::Predicate Pred1 = CI1->getPredicate(); + CmpInst::Predicate Pred2 = CI2->getPredicate(); + CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1); + CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2); + CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1); + CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2); + if (BasePred1 < BasePred2) + return !IsCompatibility; + if (BasePred1 > BasePred2) + return false; + // Compare operands. + bool LEPreds = Pred1 <= Pred2; + bool GEPreds = Pred1 >= Pred2; + for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { + auto *Op1 = CI1->getOperand(LEPreds ? I : E - I - 1); + auto *Op2 = CI2->getOperand(GEPreds ? I : E - I - 1); + if (Op1->getValueID() < Op2->getValueID()) + return !IsCompatibility; + if (Op1->getValueID() > Op2->getValueID()) + return false; + if (auto *I1 = dyn_cast<Instruction>(Op1)) + if (auto *I2 = dyn_cast<Instruction>(Op2)) { + if (I1->getParent() != I2->getParent()) + return false; + InstructionsState S = getSameOpcode({I1, I2}); + if (S.getOpcode()) + continue; + return false; + } + } + return IsCompatibility; +} + bool SLPVectorizerPass::vectorizeSimpleInstructions( SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R, bool AtTerminator) { @@ -9504,37 +9748,16 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions( } // Try to vectorize list of compares. // Sort by type, compare predicate, etc. 
- // TODO: Add analysis on the operand opcodes (profitable to vectorize - // instructions with same/alternate opcodes/const values). auto &&CompareSorter = [&R](Value *V, Value *V2) { - auto *CI1 = cast<CmpInst>(V); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType()->getTypeID() < - CI2->getOperand(0)->getType()->getTypeID()) - return true; - if (CI1->getOperand(0)->getType()->getTypeID() > - CI2->getOperand(0)->getType()->getTypeID()) - return false; - return CI1->getPredicate() < CI2->getPredicate() || - (CI1->getPredicate() > CI2->getPredicate() && - CI1->getPredicate() < - CmpInst::getSwappedPredicate(CI2->getPredicate())); + return compareCmp<false>(V, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto &&AreCompatibleCompares = [&R](Value *V1, Value *V2) { if (V1 == V2) return true; - auto *CI1 = cast<CmpInst>(V1); - auto *CI2 = cast<CmpInst>(V2); - if (R.isDeleted(CI2) || !isValidElementType(CI2->getType())) - return false; - if (CI1->getOperand(0)->getType() != CI2->getOperand(0)->getType()) - return false; - return CI1->getPredicate() == CI2->getPredicate() || - CI1->getPredicate() == - CmpInst::getSwappedPredicate(CI2->getPredicate()); + return compareCmp<true>(V1, V2, + [&R](Instruction *I) { return R.isDeleted(I); }); }; auto Limit = [&R](Value *V) { unsigned EltSize = R.getVectorElementSize(V); @@ -9592,10 +9815,15 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { return true; if (Opcodes1.size() > Opcodes2.size()) return false; + Optional<bool> ConstOrder; for (int I = 0, E = Opcodes1.size(); I < E; ++I) { // Undefs are compatible with any other value. 
- if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) + if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = + !isa<UndefValue>(Opcodes1[I]) && isa<UndefValue>(Opcodes2[I]); continue; + } if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I])) if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) { DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent()); @@ -9614,14 +9842,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { continue; return I1->getOpcode() < I2->getOpcode(); } - if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) + if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I])) { + if (!ConstOrder) + ConstOrder = Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID(); continue; + } if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID()) return true; if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID()) return false; } - return false; + return ConstOrder && *ConstOrder; }; auto AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) { if (V1 == V2) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 44b5e1df0839..1d9e71663cd2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -374,8 +374,7 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { assert((SplitAt == end() || SplitAt->getParent() == this) && "can only split at a position in the same block"); - SmallVector<VPBlockBase *, 2> Succs(getSuccessors().begin(), - getSuccessors().end()); + SmallVector<VPBlockBase *, 2> Succs(successors()); // First, disconnect the current block from its successors. 
for (VPBlockBase *Succ : Succs) VPBlockUtils::disconnectBlocks(this, Succ); @@ -642,6 +641,7 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB, void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; + Builder.SetCurrentDebugLocation(DL); if (Instruction::isBinaryOp(getOpcode())) { Value *A = State.get(getOperand(0), Part); @@ -768,6 +768,11 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, O << " "; Operand->printAsOperand(O, SlotTracker); } + + if (DL) { + O << ", !dbg "; + DL.print(O); + } } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 810dd5030f95..f4a1883e35d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -39,6 +39,7 @@ #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/IRBuilder.h" #include "llvm/Support/InstructionCost.h" #include <algorithm> @@ -51,6 +52,7 @@ namespace llvm { class BasicBlock; class DominatorTree; +class InductionDescriptor; class InnerLoopVectorizer; class LoopInfo; class raw_ostream; @@ -500,6 +502,8 @@ public: const VPBlocksTy &getSuccessors() const { return Successors; } VPBlocksTy &getSuccessors() { return Successors; } + iterator_range<VPBlockBase **> successors() { return Successors; } + const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -795,6 +799,7 @@ private: typedef unsigned char OpcodeTy; OpcodeTy Opcode; FastMathFlags FMF; + DebugLoc DL; /// Utility method serving execute(): generates a single instance of the /// modeled instruction. 
@@ -804,12 +809,14 @@ protected: void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); } public: - VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) + VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL) : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), - VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} + VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode), + DL(DL) {} - VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) - : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, + DebugLoc DL = {}) + : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL) {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPValue *V) { @@ -818,7 +825,7 @@ public: VPInstruction *clone() const { SmallVector<VPValue *, 2> Operands(operands()); - return new VPInstruction(Opcode, Operands); + return new VPInstruction(Opcode, Operands, DL); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1003,21 +1010,22 @@ public: /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. 
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { PHINode *IV; + const InductionDescriptor &IndDesc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, - TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { - if (Trunc) - new VPValue(Trunc, this); - else - new VPValue(IV, this); + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this), + IV(IV), IndDesc(IndDesc) {} + + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + const InductionDescriptor &IndDesc, + TruncInst *Trunc) + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this), + IV(IV), IndDesc(IndDesc) {} - if (Cast) - new VPValue(Cast, this); - } ~VPWidenIntOrFpInductionRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1038,13 +1046,6 @@ public: /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } - /// Returns the cast VPValue, if one is attached, or nullptr otherwise. - VPValue *getCastValue() { - if (getNumDefinedValues() != 2) - return nullptr; - return getVPValue(1); - } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1053,6 +1054,9 @@ public: const TruncInst *getTruncInst() const { return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } }; /// A recipe for handling first order recurrences and pointer inductions. For @@ -1169,7 +1173,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { /// operand. 
class VPReductionPHIRecipe : public VPWidenPHIRecipe { /// Descriptor for the reduction. - RecurrenceDescriptor &RdxDesc; + const RecurrenceDescriptor &RdxDesc; /// The phi is part of an in-loop reduction. bool IsInLoop; @@ -1180,7 +1184,7 @@ class VPReductionPHIRecipe : public VPWidenPHIRecipe { public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. - VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, bool IsOrdered = false) : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), @@ -1210,7 +1214,9 @@ public: VPSlotTracker &SlotTracker) const override; #endif - RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + const RecurrenceDescriptor &getRecurrenceDescriptor() const { + return RdxDesc; + } /// Returns true, if the phi is part of an ordered reduction. bool isOrdered() const { return IsOrdered; } @@ -1340,13 +1346,13 @@ public: /// The Operands are {ChainOp, VecOp, [Condition]}. class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. - RecurrenceDescriptor *RdxDesc; + const RecurrenceDescriptor *RdxDesc; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: - VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, + VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I, + VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { @@ -2252,6 +2258,12 @@ public: return map_range(Operands, Fn); } + /// Returns true if \p VPV is uniform after vectorization. 
+ bool isUniformAfterVectorization(VPValue *VPV) const { + auto RepR = dyn_cast_or_null<VPReplicateRecipe>(VPV->getDef()); + return !VPV->getDef() || (RepR && RepR->isUniform()); + } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -2340,18 +2352,23 @@ public: /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p - /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr - /// has more than one successor, its conditional bit is propagated to \p - /// NewBlock. \p NewBlock must have neither successors nor predecessors. + /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's + /// successors are moved from \p BlockPtr to \p NewBlock and \p BlockPtr's + /// conditional bit is propagated to \p NewBlock. \p NewBlock must have + /// neither successors nor predecessors. static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) { assert(NewBlock->getSuccessors().empty() && - "Can't insert new block with successors."); - // TODO: move successors from BlockPtr to NewBlock when this functionality - // is necessary. For now, setBlockSingleSuccessor will assert if BlockPtr - // already has successors. 
- BlockPtr->setOneSuccessor(NewBlock); - NewBlock->setPredecessors({BlockPtr}); + NewBlock->getPredecessors().empty() && + "Can't insert new block with predecessors or successors."); NewBlock->setParent(BlockPtr->getParent()); + SmallVector<VPBlockBase *> Succs(BlockPtr->successors()); + for (VPBlockBase *Succ : Succs) { + disconnectBlocks(BlockPtr, Succ); + connectBlocks(NewBlock, Succ); + } + NewBlock->setCondBit(BlockPtr->getCondBit()); + BlockPtr->setCondBit(nullptr); + connectBlocks(BlockPtr, NewBlock); } /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p @@ -2394,6 +2411,31 @@ public: To->removePredecessor(From); } + /// Try to merge \p Block into its single predecessor, if \p Block is a + /// VPBasicBlock and its predecessor has a single successor. Returns a pointer + /// to the predecessor \p Block was merged into or nullptr otherwise. + static VPBasicBlock *tryToMergeBlockIntoPredecessor(VPBlockBase *Block) { + auto *VPBB = dyn_cast<VPBasicBlock>(Block); + auto *PredVPBB = + dyn_cast_or_null<VPBasicBlock>(Block->getSinglePredecessor()); + if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1) + return nullptr; + + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) + R.moveBefore(*PredVPBB, PredVPBB->end()); + VPBlockUtils::disconnectBlocks(PredVPBB, VPBB); + auto *ParentRegion = cast<VPRegionBlock>(Block->getParent()); + if (ParentRegion->getExit() == Block) + ParentRegion->setExit(PredVPBB); + SmallVector<VPBlockBase *> Successors(Block->successors()); + for (auto *Succ : Successors) { + VPBlockUtils::disconnectBlocks(Block, Succ); + VPBlockUtils::connectBlocks(PredVPBB, Succ); + } + delete Block; + return PredVPBB; + } + /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge. 
static bool isBackEdge(const VPBlockBase *FromBlock, const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp index ac3b3505dc34..86ecd6817873 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp @@ -50,14 +50,14 @@ VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB, case EdgeType::FALSE_EDGE: // CurrBB is the False successor of PredBB - compute not of CBV. - IntermediateVal = Builder.createNot(CBV); + IntermediateVal = Builder.createNot(CBV, {}); break; } // Now AND intermediate value with PredBB's block predicate if it has one. VPValue *BP = PredBB->getPredicate(); if (BP) - return Builder.createAnd(BP, IntermediateVal); + return Builder.createAnd(BP, IntermediateVal, {}); else return IntermediateVal; } @@ -96,7 +96,7 @@ VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) { Worklist.pop_front(); // Create an OR of these values. - VPValue *Or = Builder.createOr(LHS, RHS); + VPValue *Or = Builder.createOr(LHS, RHS, {}); // Push OR to the back of the worklist. 
Worklist.push_back(Or); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index c52c8a2229e8..9e19e172dea5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -467,8 +467,9 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) { return markFailed(); assert(CombinedOperands.size() > 0 && "Need more some operands"); - auto *VPI = new VPInstruction(Opcode, CombinedOperands); - VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr()); + auto *Inst = cast<VPInstruction>(Values[0])->getUnderlyingInstr(); + auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + VPI->setUnderlyingInstr(Inst); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << *cast<VPInstruction>(Values[0]) << "\n"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ded5bc04beb5..d2daf558c2c5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -18,7 +18,8 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE) { auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); @@ -44,11 +45,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; if (auto *VPPhi = dyn_cast<VPWidenPHIRecipe>(&Ingredient)) { auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue()); - InductionDescriptor II = Inductions.lookup(Phi); - if (II.getKind() == InductionDescriptor::IK_IntInduction || - II.getKind() == InductionDescriptor::IK_FpInduction) { - VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); - NewRecipe = new 
VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) { + VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); + NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -158,8 +157,7 @@ bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. Clone->insertBefore(SinkCandidate); - SmallVector<VPUser *, 4> Users(SinkCandidate->user_begin(), - SinkCandidate->user_end()); + SmallVector<VPUser *, 4> Users(SinkCandidate->users()); for (auto *U : Users) { auto *UI = cast<VPRecipeBase>(U); if (UI->getParent() == SinkTo) @@ -266,8 +264,7 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { VPValue *PredInst1 = cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0); VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue(); - SmallVector<VPUser *> Users(Phi1ToMoveV->user_begin(), - Phi1ToMoveV->user_end()); + SmallVector<VPUser *> Users(Phi1ToMoveV->users()); for (VPUser *U : Users) { auto *UI = dyn_cast<VPRecipeBase>(U); if (!UI || UI->getParent() != Then2) @@ -295,3 +292,35 @@ bool VPlanTransforms::mergeReplicateRegions(VPlan &Plan) { delete ToDelete; return Changed; } + +void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { + SmallVector<std::pair<VPRecipeBase *, VPValue *>> CastsToRemove; + for (auto &Phi : Plan.getEntry()->getEntryBasicBlock()->phis()) { + auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); + if (!IV || IV->getTruncInst()) + continue; + + // Visit all casts connected to IV and in Casts. Collect them. + // remember them for removal. 
+ auto &Casts = IV->getInductionDescriptor().getCastInsts(); + VPValue *FindMyCast = IV; + for (Instruction *IRCast : reverse(Casts)) { + VPRecipeBase *FoundUserCast = nullptr; + for (auto *U : FindMyCast->users()) { + auto *UserCast = cast<VPRecipeBase>(U); + if (UserCast->getNumDefinedValues() == 1 && + UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) { + FoundUserCast = UserCast; + break; + } + } + assert(FoundUserCast && "Missing a cast to remove"); + CastsToRemove.emplace_back(FoundUserCast, IV); + FindMyCast = FoundUserCast->getVPSingleValue(); + } + } + for (auto &E : CastsToRemove) { + E.first->getVPSingleValue()->replaceAllUsesWith(E.second); + E.first->eraseFromParent(); + } +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index c740f2c022da..a82a562d5e35 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -14,24 +14,37 @@ #define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" namespace llvm { +class InductionDescriptor; class Instruction; +class PHINode; class ScalarEvolution; struct VPlanTransforms { /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. - static void VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, - LoopVectorizationLegality::InductionList &Inductions, - SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE); + static void + VPInstructionsToVPRecipes(Loop *OrigLoop, VPlanPtr &Plan, + function_ref<const InductionDescriptor *(PHINode *)> + GetIntOrFpInductionDescriptor, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + ScalarEvolution &SE); static bool sinkScalarOperands(VPlan &Plan); static bool mergeReplicateRegions(VPlan &Plan); + + /// Remove redundant casts of inductions. 
+ /// + /// Such redundant casts are casts of induction variables that can be ignored, + /// because we already proved that the casted phi is equal to the uncasted phi + /// in the vectorized loop. There is no need to vectorize the cast - the same + /// value can be used for both the phi and casts in the vector loop. + static void removeRedundantInductionCasts(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 6d6ea4eb30f1..7732d9367985 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -156,5 +156,31 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) { RecipeI++; } } + + const VPRegionBlock *TopRegion = cast<VPRegionBlock>(Plan.getEntry()); + const VPBasicBlock *Entry = dyn_cast<VPBasicBlock>(TopRegion->getEntry()); + if (!Entry) { + errs() << "VPlan entry block is not a VPBasicBlock\n"; + return false; + } + const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit()); + if (!Exit) { + errs() << "VPlan exit block is not a VPBasicBlock\n"; + return false; + } + + for (const VPRegionBlock *Region : + VPBlockUtils::blocksOnly<const VPRegionBlock>( + depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>( + Plan.getEntry())))) { + if (Region->getEntry()->getNumPredecessors() != 0) { + errs() << "region entry block has predecessors\n"; + return false; + } + if (Region->getExit()->getNumSuccessors() != 0) { + errs() << "region exit block has successors\n"; + return false; + } + } return true; } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 57b11e9414ba..c0aedab2fed0 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -989,9 +989,9 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { if (!FixedVT) return false; - InstructionCost 
OriginalCost = TTI.getMemoryOpCost( - Instruction::Load, LI->getType(), Align(LI->getAlignment()), - LI->getPointerAddressSpace()); + InstructionCost OriginalCost = + TTI.getMemoryOpCost(Instruction::Load, LI->getType(), LI->getAlign(), + LI->getPointerAddressSpace()); InstructionCost ScalarizedCost = 0; Instruction *LastCheckedInst = LI; |
