Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms')
220 files changed, 25355 insertions, 7555 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 06222d7e7e44..59b94567a9c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -121,14 +122,13 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) { BasicBlock *GuardBB = Phi.getIncomingBlock(RotSrc == P1); BasicBlock *RotBB = Phi.getIncomingBlock(RotSrc != P1); Instruction *TermI = GuardBB->getTerminator(); - BasicBlock *TrueBB, *FalseBB; ICmpInst::Predicate Pred; - if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), TrueBB, - FalseBB))) + BasicBlock *PhiBB = Phi.getParent(); + if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(RotAmt), m_ZeroInt()), + m_SpecificBB(PhiBB), m_SpecificBB(RotBB)))) return false; - BasicBlock *PhiBB = Phi.getParent(); - if (Pred != CmpInst::ICMP_EQ || TrueBB != PhiBB || FalseBB != RotBB) + if (Pred != CmpInst::ICMP_EQ) return false; // We matched a variation of this IR pattern: @@ -251,6 +251,72 @@ static bool foldAnyOrAllBitsSet(Instruction &I) { return true; } +// Try to recognize below function as popcount intrinsic. +// This is the "best" algorithm from +// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel +// Also used in TargetLowering::expandCTPOP(). +// +// int popcount(unsigned int i) { +// i = i - ((i >> 1) & 0x55555555); +// i = (i & 0x33333333) + ((i >> 2) & 0x33333333); +// i = ((i + (i >> 4)) & 0x0F0F0F0F); +// return (i * 0x01010101) >> 24; +// } +static bool tryToRecognizePopCount(Instruction &I) { + if (I.getOpcode() != Instruction::LShr) + return false; + + Type *Ty = I.getType(); + if (!Ty->isIntOrIntVectorTy()) + return false; + + unsigned Len = Ty->getScalarSizeInBits(); + // FIXME: fix Len == 8 and other irregular type lengths. + if (!(Len <= 128 && Len > 8 && Len % 8 == 0)) + return false; + + APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55)); + APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33)); + APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F)); + APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01)); + APInt MaskShift = APInt(Len, Len - 8); + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + Value *MulOp0; + // Matching "(i * 0x01010101...) >> 24". + if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) && + match(Op1, m_SpecificInt(MaskShift))) { + Value *ShiftOp0; + // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)". + if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)), + m_Deferred(ShiftOp0)), + m_SpecificInt(Mask0F)))) { + Value *AndOp0; + // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)". + if (match(ShiftOp0, + m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)), + m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)), + m_SpecificInt(Mask33))))) { + Value *Root, *SubOp1; + // Matching "i - ((i >> 1) & 0x55555555...)". 
+ if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) && + match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)), + m_SpecificInt(Mask55)))) { + LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); + IRBuilder<> Builder(&I); + Function *Func = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::ctpop, I.getType()); + I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); + return true; + } + } + } + } + + return false; +} + /// This is the entry point for folds that could be implemented in regular /// InstCombine, but they are separated because they are not expected to /// occur frequently and/or have more than a constant-length pattern match. @@ -269,6 +335,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) { for (Instruction &I : make_range(BB.rbegin(), BB.rend())) { MadeChange |= foldAnyOrAllBitsSet(I); MadeChange |= foldGuardedRotateToFunnelShift(I); + MadeChange |= tryToRecognizePopCount(I); } } @@ -303,7 +370,7 @@ void AggressiveInstCombinerLegacyPass::getAnalysisUsage( } bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return runImpl(F, TLI, DT); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp new file mode 100644 index 000000000000..7c5e90cb53cd --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/CFGuard/CFGuard.cpp @@ -0,0 +1,305 @@ +//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the IR transform to add Microsoft's Control Flow Guard +/// checks on Windows targets. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/CFGuard.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; + +using OperandBundleDef = OperandBundleDefT<Value *>; + +#define DEBUG_TYPE "cfguard" + +STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added"); + +namespace { + +/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes. +/// These checks ensure that the target address corresponds to the start of an +/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86, +/// ARM, and AArch64 targets use the CF_Check machanism. +class CFGuard : public FunctionPass { +public: + static char ID; + + enum Mechanism { CF_Check, CF_Dispatch }; + + // Default constructor required for the INITIALIZE_PASS macro. + CFGuard() : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + // By default, use the guard check mechanism. + GuardMechanism = CF_Check; + } + + // Recommended constructor used to specify the type of guard mechanism. 
+ CFGuard(Mechanism Var) : FunctionPass(ID) { + initializeCFGuardPass(*PassRegistry::getPassRegistry()); + GuardMechanism = Var; + } + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// check mechanism. When the image is loaded, the loader puts the appropriate + /// guard check function pointer in the __guard_check_icall_fptr global + /// symbol. This checks that the target address is a valid address-taken + /// function. The address of the target function is passed to the guard check + /// function in an architecture-specific register (e.g. ECX on 32-bit X86, + /// X15 on Aarch64, and R0 on ARM). The guard check function has no return + /// value (if the target is invalid, the guard check funtion will raise an + /// error). + /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr + /// %2 = bitcast i32 ()* %0 to i8* + /// call cfguard_checkcc void %1(i8* %2) + /// %3 = call i32 %0() + /// \endcode + /// + /// For example, the following X86 assembly code: + /// \code + /// movl $_target_func, %eax + /// calll *%eax + /// \endcode + /// + /// is transformed to: + /// \code + /// movl $_target_func, %ecx + /// calll *___guard_check_icall_fptr + /// calll *%ecx + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardCheck(CallBase *CB); + + /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG + /// dispatch mechanism. When the image is loaded, the loader puts the + /// appropriate guard check function pointer in the + /// __guard_dispatch_icall_fptr global symbol. This checks that the target + /// address is a valid address-taken function and, if so, tail calls the + /// target. The target address is passed in an architecture-specific register + /// (e.g. RAX on X86_64), with all other arguments for the target function + /// passed as usual. + /// + /// For example, the following LLVM IR: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = call i32 %0() + /// \endcode + /// + /// is transformed to: + /// \code + /// %func_ptr = alloca i32 ()*, align 8 + /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8 + /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8 + /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr + /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ] + /// \endcode + /// + /// For example, the following X86_64 assembly code: + /// \code + /// leaq target_func(%rip), %rax + /// callq *%rax + /// \endcode + /// + /// is transformed to: + /// \code + /// leaq target_func(%rip), %rax + /// callq *__guard_dispatch_icall_fptr(%rip) + /// \endcode + /// + /// \param CB indirect call to instrument. + void insertCFGuardDispatch(CallBase *CB); + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + +private: + // Only add checks if the module has the cfguard=2 flag. 
+ int cfguard_module_flag = 0; + Mechanism GuardMechanism = CF_Check; + FunctionType *GuardFnType = nullptr; + PointerType *GuardFnPtrType = nullptr; + Constant *GuardFnGlobal = nullptr; +}; + +} // end anonymous namespace + +void CFGuard::insertCFGuardCheck(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + + // Load the global symbol as a pointer to the check function. + LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal); + + // Create new call instruction. The CFGuard check should always be a call, + // even if the original CallBase is an Invoke or CallBr instruction. + CallInst *GuardCheck = + B.CreateCall(GuardFnType, GuardCheckLoad, + {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())}); + + // Ensure that the first argument is passed in the correct register + // (e.g. ECX on 32-bit X86 targets). + GuardCheck->setCallingConv(CallingConv::CFGuard_Check); +} + +void CFGuard::insertCFGuardDispatch(CallBase *CB) { + + assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() && + "Only applicable for Windows targets"); + assert(CB->isIndirectCall() && + "Control Flow Guard checks can only be added to indirect calls"); + + IRBuilder<> B(CB); + Value *CalledOperand = CB->getCalledOperand(); + Type *CalledOperandType = CalledOperand->getType(); + + // Cast the guard dispatch global to the type of the called operand. + PointerType *PTy = PointerType::get(CalledOperandType, 0); + if (GuardFnGlobal->getType() != PTy) + GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy); + + // Load the global as a pointer to a function of the same type. + LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal); + + // Add the original call target as a cfguardtarget operand bundle. + SmallVector<llvm::OperandBundleDef, 1> Bundles; + CB->getOperandBundlesAsDefs(Bundles); + Bundles.emplace_back("cfguardtarget", CalledOperand); + + // Create a copy of the call/invoke instruction and add the new bundle. + CallBase *NewCB; + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + NewCB = CallInst::Create(CI, Bundles, CB); + } else { + assert(isa<InvokeInst>(CB) && "Unknown indirect call type"); + InvokeInst *II = cast<InvokeInst>(CB); + NewCB = llvm::InvokeInst::Create(II, Bundles, CB); + } + + // Change the target of the call to be the guard dispatch function. + NewCB->setCalledOperand(GuardDispatchLoad); + + // Replace the original call/invoke with the new instruction. + CB->replaceAllUsesWith(NewCB); + + // Delete the original call/invoke. + CB->eraseFromParent(); +} + +bool CFGuard::doInitialization(Module &M) { + + // Check if this module has the cfguard flag and read its value. + if (auto *MD = + mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard"))) + cfguard_module_flag = MD->getZExtValue(); + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + // Set up prototypes for the guard check and dispatch functions. + GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()), + {Type::getInt8PtrTy(M.getContext())}, false); + GuardFnPtrType = PointerType::get(GuardFnType, 0); + + // Get or insert the guard check or dispatch global symbols. 
+ if (GuardMechanism == CF_Check) { + GuardFnGlobal = + M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType); + } else { + assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism"); + GuardFnGlobal = + M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType); + } + + return true; +} + +bool CFGuard::runOnFunction(Function &F) { + + // Skip modules for which CFGuard checks have been disabled. + if (cfguard_module_flag != 2) + return false; + + SmallVector<CallBase *, 8> IndirectCalls; + + // Iterate over the instructions to find all indirect call/invoke/callbr + // instructions. Make a separate list of pointers to indirect + // call/invoke/callbr instructions because the original instructions will be + // deleted as the checks are added. + for (BasicBlock &BB : F.getBasicBlockList()) { + for (Instruction &I : BB.getInstList()) { + auto *CB = dyn_cast<CallBase>(&I); + if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) { + IndirectCalls.push_back(CB); + CFGuardCounter++; + } + } + } + + // If no checks are needed, return early. + if (IndirectCalls.empty()) { + return false; + } + + // For each indirect call/invoke, add the appropriate dispatch or check. + if (GuardMechanism == CF_Dispatch) { + for (CallBase *CB : IndirectCalls) { + insertCFGuardDispatch(CB); + } + } else { + for (CallBase *CB : IndirectCalls) { + insertCFGuardCheck(CB); + } + } + + return true; +} + +char CFGuard::ID = 0; +INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false) + +FunctionPass *llvm::createCFGuardCheckPass() { + return new CFGuard(CFGuard::CF_Check); +} + +FunctionPass *llvm::createCFGuardDispatchPass() { + return new CFGuard(CFGuard::CF_Dispatch); +}
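The two factory functions above (createCFGuardCheckPass and createCFGuardDispatchPass) are the entry points exposed through llvm/Transforms/CFGuard.h. A minimal usage sketch, assuming a legacy pass manager and a module M that already targets Windows and carries the cfguard=2 module flag (the driver function name is illustrative, not part of this change):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/CFGuard.h"

// Hypothetical driver: instrument every indirect call in M with the
// CF_Dispatch mechanism; use createCFGuardCheckPass() for CF_Check instead.
void addControlFlowGuard(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createCFGuardDispatchPass());
  PM.run(M);
}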
\ No newline at end of file diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index 1fb0a114d0c7..c2dbd6f41642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -73,6 +73,8 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { II->replaceAllUsesWith(ConstantInt::getTrue(Context)); break; case Intrinsic::coro_id: + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: II->replaceAllUsesWith(ConstantTokenNone::get(Context)); break; case Intrinsic::coro_subfn_addr: @@ -97,11 +99,11 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) { namespace { -struct CoroCleanup : FunctionPass { +struct CoroCleanupLegacy : FunctionPass { static char ID; // Pass identification, replacement for typeid - CoroCleanup() : FunctionPass(ID) { - initializeCoroCleanupPass(*PassRegistry::getPassRegistry()); + CoroCleanupLegacy() : FunctionPass(ID) { + initializeCoroCleanupLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -111,8 +113,9 @@ struct CoroCleanup : FunctionPass { bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr", "llvm.coro.free", - "llvm.coro.id"})) - L = llvm::make_unique<Lowerer>(M); + "llvm.coro.id", "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once"})) + L = std::make_unique<Lowerer>(M); return false; } @@ -129,8 +132,8 @@ struct CoroCleanup : FunctionPass { }; } -char CoroCleanup::ID = 0; -INITIALIZE_PASS(CoroCleanup, "coro-cleanup", +char CoroCleanupLegacy::ID = 0; +INITIALIZE_PASS(CoroCleanupLegacy, "coro-cleanup", "Lower all coroutine related intrinsics", false, false) -Pass *llvm::createCoroCleanupPass() { return new CoroCleanup(); } +Pass *llvm::createCoroCleanupLegacyPass() { return new CoroCleanupLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 692697d6f32e..e73fb9eeb1e9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -22,7 +22,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-early" namespace { -// Created on demand if CoroEarly pass has work to do. +// Created on demand if the coro-early pass has work to do. class Lowerer : public coro::LowererBase { IRBuilder<> Builder; PointerType *const AnyResumeFnPtrTy; @@ -91,13 +91,14 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) { Value *Operand = II->getArgOperand(0); // ResumeFnAddr is the first pointer sized element of the coroutine frame. 
+ static_assert(coro::Shape::SwitchFieldIndex::Resume == 0, + "resume function not at offset zero"); auto *FrameTy = Int8Ptr; PointerType *FramePtrTy = FrameTy->getPointerTo(); Builder.SetInsertPoint(II); auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy); - auto *Gep = Builder.CreateConstInBoundsGEP1_32(FrameTy, BCI, 0); - auto *Load = Builder.CreateLoad(FrameTy, Gep); + auto *Load = Builder.CreateLoad(BCI); auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); II->replaceAllUsesWith(Cond); @@ -189,6 +190,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { } } break; + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: + F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT); + break; case Intrinsic::coro_resume: lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex); break; @@ -220,10 +225,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) { namespace { -struct CoroEarly : public FunctionPass { +struct CoroEarlyLegacy : public FunctionPass { static char ID; // Pass identification, replacement for typeid. - CoroEarly() : FunctionPass(ID) { - initializeCoroEarlyPass(*PassRegistry::getPassRegistry()); + CoroEarlyLegacy() : FunctionPass(ID) { + initializeCoroEarlyLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; @@ -231,11 +236,18 @@ struct CoroEarly : public FunctionPass { // This pass has work to do only if we find intrinsics we are going to lower // in the module. bool doInitialization(Module &M) override { - if (coro::declaresIntrinsics( - M, {"llvm.coro.id", "llvm.coro.destroy", "llvm.coro.done", - "llvm.coro.end", "llvm.coro.noop", "llvm.coro.free", - "llvm.coro.promise", "llvm.coro.resume", "llvm.coro.suspend"})) - L = llvm::make_unique<Lowerer>(M); + if (coro::declaresIntrinsics(M, {"llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.noop", + "llvm.coro.free", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.suspend"})) + L = std::make_unique<Lowerer>(M); return false; } @@ -255,8 +267,8 @@ struct CoroEarly : public FunctionPass { }; } -char CoroEarly::ID = 0; -INITIALIZE_PASS(CoroEarly, "coro-early", "Lower early coroutine intrinsics", - false, false) +char CoroEarlyLegacy::ID = 0; +INITIALIZE_PASS(CoroEarlyLegacy, "coro-early", + "Lower early coroutine intrinsics", false, false) -Pass *llvm::createCoroEarlyPass() { return new CoroEarly(); } +Pass *llvm::createCoroEarlyLegacyPass() { return new CoroEarlyLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp index 6707aa1c827d..23d22e23861a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroElide.cpp @@ -15,6 +15,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InstIterator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -23,7 +24,7 @@ using namespace llvm; #define DEBUG_TYPE "coro-elide" namespace { -// Created on demand if CoroElide pass has work to do. +// Created on demand if the coro-elide pass has work to do. 
struct Lowerer : coro::LowererBase { SmallVector<CoroIdInst *, 4> CoroIds; SmallVector<CoroBeginInst *, 1> CoroBegins; @@ -276,17 +277,17 @@ static bool replaceDevirtTrigger(Function &F) { //===----------------------------------------------------------------------===// namespace { -struct CoroElide : FunctionPass { +struct CoroElideLegacy : FunctionPass { static char ID; - CoroElide() : FunctionPass(ID) { - initializeCoroElidePass(*PassRegistry::getPassRegistry()); + CoroElideLegacy() : FunctionPass(ID) { + initializeCoroElideLegacyPass(*PassRegistry::getPassRegistry()); } std::unique_ptr<Lowerer> L; bool doInitialization(Module &M) override { if (coro::declaresIntrinsics(M, {"llvm.coro.id"})) - L = llvm::make_unique<Lowerer>(M); + L = std::make_unique<Lowerer>(M); return false; } @@ -329,15 +330,15 @@ struct CoroElide : FunctionPass { }; } -char CoroElide::ID = 0; +char CoroElideLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END( - CoroElide, "coro-elide", + CoroElideLegacy, "coro-elide", "Coroutine frame allocation elision and indirect calls replacement", false, false) -Pass *llvm::createCoroElidePass() { return new CoroElide(); } +Pass *llvm::createCoroElideLegacyPass() { return new CoroElideLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 58bf22bee29b..2c42cf8a6d25 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -18,6 +18,7 @@ #include "CoroInternal.h" #include "llvm/ADT/BitVector.h" +#include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/CFG.h" @@ -28,6 +29,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/circular_raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" using namespace llvm; @@ -120,6 +122,15 @@ struct SuspendCrossingInfo { return false; BasicBlock *UseBB = I->getParent(); + + // As a special case, treat uses by an llvm.coro.suspend.retcon + // as if they were uses in the suspend's single predecessor: the + // uses conceptually occur before the suspend. + if (isa<CoroSuspendRetconInst>(I)) { + UseBB = UseBB->getSinglePredecessor(); + assert(UseBB && "should have split coro.suspend into its own block"); + } + return hasPathCrossingSuspendPoint(DefBB, UseBB); } @@ -128,7 +139,17 @@ struct SuspendCrossingInfo { } bool isDefinitionAcrossSuspend(Instruction &I, User *U) const { - return isDefinitionAcrossSuspend(I.getParent(), U); + auto *DefBB = I.getParent(); + + // As a special case, treat values produced by an llvm.coro.suspend.* + // as if they were defined in the single successor: the uses + // conceptually occur after the suspend. 
+ if (isa<AnyCoroSuspendInst>(I)) { + DefBB = DefBB->getSingleSuccessor(); + assert(DefBB && "should have split coro.suspend into its own block"); + } + + return isDefinitionAcrossSuspend(DefBB, U); } }; } // end anonymous namespace @@ -183,9 +204,10 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) B.Suspend = true; B.Kills |= B.Consumes; }; - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { + for (auto *CSI : Shape.CoroSuspends) { markSuspendBlock(CSI); - markSuspendBlock(CSI->getCoroSave()); + if (auto *Save = CSI->getCoroSave()) + markSuspendBlock(Save); } // Iterate propagating consumes and kills until they stop changing. @@ -261,11 +283,13 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) // We build up the list of spills for every case where a use is separated // from the definition by a suspend point. +static const unsigned InvalidFieldIndex = ~0U; + namespace { class Spill { Value *Def = nullptr; Instruction *User = nullptr; - unsigned FieldNo = 0; + unsigned FieldNo = InvalidFieldIndex; public: Spill(Value *Def, llvm::User *U) : Def(Def), User(cast<Instruction>(U)) {} @@ -280,11 +304,11 @@ public: // the definition the first time they encounter it. Consider refactoring // SpillInfo into two arrays to normalize the spill representation. unsigned fieldIndex() const { - assert(FieldNo && "Accessing unassigned field"); + assert(FieldNo != InvalidFieldIndex && "Accessing unassigned field"); return FieldNo; } void setFieldIndex(unsigned FieldNumber) { - assert(!FieldNo && "Reassigning field number"); + assert(FieldNo == InvalidFieldIndex && "Reassigning field number"); FieldNo = FieldNumber; } }; @@ -376,18 +400,30 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, SmallString<32> Name(F.getName()); Name.append(".Frame"); StructType *FrameTy = StructType::create(C, Name); - auto *FramePtrTy = FrameTy->getPointerTo(); - auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, - /*isVarArg=*/false); - auto *FnPtrTy = FnTy->getPointerTo(); - - // Figure out how wide should be an integer type storing the suspend index. - unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); - Type *PromiseType = Shape.PromiseAlloca - ? Shape.PromiseAlloca->getType()->getElementType() - : Type::getInt1Ty(C); - SmallVector<Type *, 8> Types{FnPtrTy, FnPtrTy, PromiseType, - Type::getIntNTy(C, IndexBits)}; + SmallVector<Type *, 8> Types; + + AllocaInst *PromiseAlloca = Shape.getPromiseAlloca(); + + if (Shape.ABI == coro::ABI::Switch) { + auto *FramePtrTy = FrameTy->getPointerTo(); + auto *FnTy = FunctionType::get(Type::getVoidTy(C), FramePtrTy, + /*IsVarArg=*/false); + auto *FnPtrTy = FnTy->getPointerTo(); + + // Figure out how wide should be an integer type storing the suspend index. + unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size())); + Type *PromiseType = PromiseAlloca + ? PromiseAlloca->getType()->getElementType() + : Type::getInt1Ty(C); + Type *IndexType = Type::getIntNTy(C, IndexBits); + Types.push_back(FnPtrTy); + Types.push_back(FnPtrTy); + Types.push_back(PromiseType); + Types.push_back(IndexType); + } else { + assert(PromiseAlloca == nullptr && "lowering doesn't support promises"); + } + Value *CurrentDef = nullptr; Padder.addTypes(Types); @@ -399,7 +435,7 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, CurrentDef = S.def(); // PromiseAlloca was already added to Types array earlier. 
- if (CurrentDef == Shape.PromiseAlloca) + if (CurrentDef == PromiseAlloca) continue; uint64_t Count = 1; @@ -430,9 +466,80 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape, } FrameTy->setBody(Types); + switch (Shape.ABI) { + case coro::ABI::Switch: + break; + + // Remember whether the frame is inline in the storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + auto &Layout = F.getParent()->getDataLayout(); + auto Id = Shape.getRetconCoroId(); + Shape.RetconLowering.IsFrameInlineInStorage + = (Layout.getTypeAllocSize(FrameTy) <= Id->getStorageSize() && + Layout.getABITypeAlignment(FrameTy) <= Id->getStorageAlignment()); + break; + } + } + return FrameTy; } +// We use a pointer use visitor to discover if there are any writes into an +// alloca that dominates CoroBegin. If that is the case, insertSpills will copy +// the value from the alloca into the coroutine frame spill slot corresponding +// to that alloca. +namespace { +struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> { + using Base = PtrUseVisitor<AllocaUseVisitor>; + AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT, + const CoroBeginInst &CB) + : PtrUseVisitor(DL), DT(DT), CoroBegin(CB) {} + + // We are only interested in uses that dominate coro.begin. + void visit(Instruction &I) { + if (DT.dominates(&I, &CoroBegin)) + Base::visit(I); + } + // We need to provide this overload as PtrUseVisitor uses a pointer based + // visiting function. + void visit(Instruction *I) { return visit(*I); } + + void visitLoadInst(LoadInst &) {} // Good. Nothing to do. + + // If the use is an operand, the pointer escaped and anything can write into + // that memory. If the use is the pointer, we are definitely writing into the + // alloca and therefore we need to copy. + void visitStoreInst(StoreInst &SI) { PI.setAborted(&SI); } + + // Any other instruction that is not filtered out by PtrUseVisitor, will + // result in the copy. + void visitInstruction(Instruction &I) { PI.setAborted(&I); } + +private: + const DominatorTree &DT; + const CoroBeginInst &CoroBegin; +}; +} // namespace +static bool mightWriteIntoAllocaPtr(AllocaInst &A, const DominatorTree &DT, + const CoroBeginInst &CB) { + const DataLayout &DL = A.getModule()->getDataLayout(); + AllocaUseVisitor Visitor(DL, DT, CB); + auto PtrI = Visitor.visitPtr(A); + if (PtrI.isEscaped() || PtrI.isAborted()) { + auto *PointerEscapingInstr = PtrI.getEscapingInst() + ? PtrI.getEscapingInst() + : PtrI.getAbortingInst(); + if (PointerEscapingInstr) { + LLVM_DEBUG( + dbgs() << "AllocaInst copy was triggered by instruction: " + << *PointerEscapingInstr << "\n"); + } + return true; + } + return false; +} + // We need to make room to insert a spill after initial PHIs, but before // catchswitch instruction. Placing it before violates the requirement that // catchswitch, like all other EHPads must be the first nonPHI in a block. 
@@ -476,7 +583,7 @@ static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) { // whatever // // -static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { +static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) { auto *CB = Shape.CoroBegin; LLVMContext &C = CB->getContext(); IRBuilder<> Builder(CB->getNextNode()); @@ -484,11 +591,14 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { PointerType *FramePtrTy = FrameTy->getPointerTo(); auto *FramePtr = cast<Instruction>(Builder.CreateBitCast(CB, FramePtrTy, "FramePtr")); + DominatorTree DT(*CB->getFunction()); Value *CurrentValue = nullptr; BasicBlock *CurrentBlock = nullptr; Value *CurrentReload = nullptr; - unsigned Index = 0; // Proper field number will be read from field definition. + + // Proper field number will be read from field definition. + unsigned Index = InvalidFieldIndex; // We need to keep track of any allocas that need "spilling" // since they will live in the coroutine frame now, all access to them @@ -496,9 +606,11 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // we remember allocas and their indices to be handled once we processed // all the spills. SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas; - // Promise alloca (if present) has a fixed field number (Shape::PromiseField) - if (Shape.PromiseAlloca) - Allocas.emplace_back(Shape.PromiseAlloca, coro::Shape::PromiseField); + // Promise alloca (if present) has a fixed field number. + if (auto *PromiseAlloca = Shape.getPromiseAlloca()) { + assert(Shape.ABI == coro::ABI::Switch); + Allocas.emplace_back(PromiseAlloca, coro::Shape::SwitchFieldIndex::Promise); + } // Create a GEP with the given index into the coroutine frame for the original // value Orig. Appends an extra 0 index for array-allocas, preserving the @@ -526,7 +638,7 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // Create a load instruction to reload the spilled value from the coroutine // frame. auto CreateReload = [&](Instruction *InsertBefore) { - assert(Index && "accessing unassigned field number"); + assert(Index != InvalidFieldIndex && "accessing unassigned field number"); Builder.SetInsertPoint(InsertBefore); auto *G = GetFramePointer(Index, CurrentValue); @@ -558,29 +670,45 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { // coroutine frame. Instruction *InsertPt = nullptr; - if (isa<Argument>(CurrentValue)) { + if (auto Arg = dyn_cast<Argument>(CurrentValue)) { // For arguments, we will place the store instruction right after // the coroutine frame pointer instruction, i.e. bitcast of // coro.begin from i8* to %f.frame*. InsertPt = FramePtr->getNextNode(); + + // If we're spilling an Argument, make sure we clear 'nocapture' + // from the coroutine function. + Arg->getParent()->removeParamAttr(Arg->getArgNo(), + Attribute::NoCapture); + } else if (auto *II = dyn_cast<InvokeInst>(CurrentValue)) { // If we are spilling the result of the invoke instruction, split the // normal edge and insert the spill in the new block. auto NewBB = SplitEdge(II->getParent(), II->getNormalDest()); InsertPt = NewBB->getTerminator(); - } else if (dyn_cast<PHINode>(CurrentValue)) { + } else if (isa<PHINode>(CurrentValue)) { // Skip the PHINodes and EH pads instructions. 
BasicBlock *DefBlock = cast<Instruction>(E.def())->getParent(); if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator())) InsertPt = splitBeforeCatchSwitch(CSI); else InsertPt = &*DefBlock->getFirstInsertionPt(); + } else if (auto CSI = dyn_cast<AnyCoroSuspendInst>(CurrentValue)) { + // Don't spill immediately after a suspend; splitting assumes + // that the suspend will be followed by a branch. + InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHI(); } else { + auto *I = cast<Instruction>(E.def()); + assert(!I->isTerminator() && "unexpected terminator"); // For all other values, the spill is placed immediately after // the definition. - assert(!cast<Instruction>(E.def())->isTerminator() && - "unexpected terminator"); - InsertPt = cast<Instruction>(E.def())->getNextNode(); + if (DT.dominates(CB, I)) { + InsertPt = I->getNextNode(); + } else { + // Unless, it is not dominated by CoroBegin, then it will be + // inserted immediately after CoroFrame is computed. + InsertPt = FramePtr->getNextNode(); + } } Builder.SetInsertPoint(InsertPt); @@ -613,21 +741,53 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { } BasicBlock *FramePtrBB = FramePtr->getParent(); - Shape.AllocaSpillBlock = - FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); - Shape.AllocaSpillBlock->splitBasicBlock(&Shape.AllocaSpillBlock->front(), - "PostSpill"); - Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + auto SpillBlock = + FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB"); + SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill"); + Shape.AllocaSpillBlock = SpillBlock; // If we found any allocas, replace all of their remaining uses with Geps. + // Note: we cannot do it indiscriminately as some of the uses may not be + // dominated by CoroBegin. + bool MightNeedToCopy = false; + Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front()); + SmallVector<Instruction *, 4> UsersToUpdate; for (auto &P : Allocas) { - auto *G = GetFramePointer(P.second, P.first); + AllocaInst *const A = P.first; + UsersToUpdate.clear(); + for (User *U : A->users()) { + auto *I = cast<Instruction>(U); + if (DT.dominates(CB, I)) + UsersToUpdate.push_back(I); + else + MightNeedToCopy = true; + } + if (!UsersToUpdate.empty()) { + auto *G = GetFramePointer(P.second, A); + G->takeName(A); + for (Instruction *I : UsersToUpdate) + I->replaceUsesOfWith(A, G); + } + } + // If we discovered such uses not dominated by CoroBegin, see if any of them + // preceed coro begin and have instructions that can modify the + // value of the alloca and therefore would require a copying the value into + // the spill slot in the coroutine frame. + if (MightNeedToCopy) { + Builder.SetInsertPoint(FramePtr->getNextNode()); + + for (auto &P : Allocas) { + AllocaInst *const A = P.first; + if (mightWriteIntoAllocaPtr(*A, DT, *CB)) { + if (A->isArrayAllocation()) + report_fatal_error( + "Coroutines cannot handle copying of array allocas yet"); - // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G)) here, - // as we are changing location of the instruction. - G->takeName(P.first); - P.first->replaceAllUsesWith(G); - P.first->eraseFromParent(); + auto *G = GetFramePointer(P.second, A); + auto *Value = Builder.CreateLoad(A); + Builder.CreateStore(Value, G); + } + } } return FramePtr; } @@ -829,52 +989,6 @@ static void rewriteMaterializableInstructions(IRBuilder<> &IRB, } } -// Move early uses of spilled variable after CoroBegin. 
-// For example, if a parameter had address taken, we may end up with the code -// like: -// define @f(i32 %n) { -// %n.addr = alloca i32 -// store %n, %n.addr -// ... -// call @coro.begin -// we need to move the store after coro.begin -static void moveSpillUsesAfterCoroBegin(Function &F, SpillInfo const &Spills, - CoroBeginInst *CoroBegin) { - DominatorTree DT(F); - SmallVector<Instruction *, 8> NeedsMoving; - - Value *CurrentValue = nullptr; - - for (auto const &E : Spills) { - if (CurrentValue == E.def()) - continue; - - CurrentValue = E.def(); - - for (User *U : CurrentValue->users()) { - Instruction *I = cast<Instruction>(U); - if (!DT.dominates(CoroBegin, I)) { - LLVM_DEBUG(dbgs() << "will move: " << *I << "\n"); - - // TODO: Make this more robust. Currently if we run into a situation - // where simple instruction move won't work we panic and - // report_fatal_error. - for (User *UI : I->users()) { - if (!DT.dominates(CoroBegin, cast<Instruction>(UI))) - report_fatal_error("cannot move instruction since its users are not" - " dominated by CoroBegin"); - } - - NeedsMoving.push_back(I); - } - } - } - - Instruction *InsertPt = CoroBegin->getNextNode(); - for (Instruction *I : NeedsMoving) - I->moveBefore(InsertPt); -} - // Splits the block at a particular instruction unless it is the first // instruction in the block with a single predecessor. static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) { @@ -895,21 +1009,337 @@ static void splitAround(Instruction *I, const Twine &Name) { splitBlockIfNotFirst(I->getNextNode(), "After" + Name); } +static bool isSuspendBlock(BasicBlock *BB) { + return isa<AnyCoroSuspendInst>(BB->front()); +} + +typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet; + +/// Does control flow starting at the given block ever reach a suspend +/// instruction before reaching a block in VisitedOrFreeBBs? +static bool isSuspendReachableFrom(BasicBlock *From, + VisitedBlocksSet &VisitedOrFreeBBs) { + // Eagerly try to add this block to the visited set. If it's already + // there, stop recursing; this path doesn't reach a suspend before + // either looping or reaching a freeing block. + if (!VisitedOrFreeBBs.insert(From).second) + return false; + + // We assume that we'll already have split suspends into their own blocks. + if (isSuspendBlock(From)) + return true; + + // Recurse on the successors. + for (auto Succ : successors(From)) { + if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs)) + return true; + } + + return false; +} + +/// Is the given alloca "local", i.e. bounded in lifetime to not cross a +/// suspend point? +static bool isLocalAlloca(CoroAllocaAllocInst *AI) { + // Seed the visited set with all the basic blocks containing a free + // so that we won't pass them up. + VisitedBlocksSet VisitedOrFreeBBs; + for (auto User : AI->users()) { + if (auto FI = dyn_cast<CoroAllocaFreeInst>(User)) + VisitedOrFreeBBs.insert(FI->getParent()); + } + + return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs); +} + +/// After we split the coroutine, will the given basic block be along +/// an obvious exit path for the resumption function? +static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB, + unsigned depth = 3) { + // If we've bottomed out our depth count, stop searching and assume + // that the path might loop back. + if (depth == 0) return false; + + // If this is a suspend block, we're about to exit the resumption function. + if (isSuspendBlock(BB)) return true; + + // Recurse into the successors. 
+ for (auto Succ : successors(BB)) { + if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1)) + return false; + } + + // If none of the successors leads back in a loop, we're on an exit/abort. + return true; +} + +static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) { + // Look for a free that isn't sufficiently obviously followed by + // either a suspend or a termination, i.e. something that will leave + // the coro resumption frame. + for (auto U : AI->users()) { + auto FI = dyn_cast<CoroAllocaFreeInst>(U); + if (!FI) continue; + + if (!willLeaveFunctionImmediatelyAfter(FI->getParent())) + return true; + } + + // If we never found one, we don't need a stack save. + return false; +} + +/// Turn each of the given local allocas into a normal (dynamic) alloca +/// instruction. +static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas, + SmallVectorImpl<Instruction*> &DeadInsts) { + for (auto AI : LocalAllocas) { + auto M = AI->getModule(); + IRBuilder<> Builder(AI); + + // Save the stack depth. Try to avoid doing this if the stackrestore + // is going to immediately precede a return or something. + Value *StackSave = nullptr; + if (localAllocaNeedsStackSave(AI)) + StackSave = Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::stacksave)); + + // Allocate memory. + auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize()); + Alloca->setAlignment(MaybeAlign(AI->getAlignment())); + + for (auto U : AI->users()) { + // Replace gets with the allocation. + if (isa<CoroAllocaGetInst>(U)) { + U->replaceAllUsesWith(Alloca); + + // Replace frees with stackrestores. This is safe because + // alloca.alloc is required to obey a stack discipline, although we + // don't enforce that structurally. + } else { + auto FI = cast<CoroAllocaFreeInst>(U); + if (StackSave) { + Builder.SetInsertPoint(FI); + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::stackrestore), + StackSave); + } + } + DeadInsts.push_back(cast<Instruction>(U)); + } + + DeadInsts.push_back(AI); + } +} + +/// Turn the given coro.alloca.alloc call into a dynamic allocation. +/// This happens during the all-instructions iteration, so it must not +/// delete the call. +static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI, + coro::Shape &Shape, + SmallVectorImpl<Instruction*> &DeadInsts) { + IRBuilder<> Builder(AI); + auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr); + + for (User *U : AI->users()) { + if (isa<CoroAllocaGetInst>(U)) { + U->replaceAllUsesWith(Alloc); + } else { + auto FI = cast<CoroAllocaFreeInst>(U); + Builder.SetInsertPoint(FI); + Shape.emitDealloc(Builder, Alloc, nullptr); + } + DeadInsts.push_back(cast<Instruction>(U)); + } + + // Push this on last so that it gets deleted after all the others. + DeadInsts.push_back(AI); + + // Return the new allocation value so that we can check for needed spills. + return cast<Instruction>(Alloc); +} + +/// Get the current swifterror value. +static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy, + coro::Shape &Shape) { + // Make a fake function pointer as a sort of intrinsic. + auto FnTy = FunctionType::get(ValueTy, {}, false); + auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); + + auto Call = Builder.CreateCall(Fn, {}); + Shape.SwiftErrorOps.push_back(Call); + + return Call; +} + +/// Set the given value as the current swifterror value. +/// +/// Returns a slot that can be used as a swifterror slot. 
+static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V, + coro::Shape &Shape) { + // Make a fake function pointer as a sort of intrinsic. + auto FnTy = FunctionType::get(V->getType()->getPointerTo(), + {V->getType()}, false); + auto Fn = ConstantPointerNull::get(FnTy->getPointerTo()); + + auto Call = Builder.CreateCall(Fn, { V }); + Shape.SwiftErrorOps.push_back(Call); + + return Call; +} + +/// Set the swifterror value from the given alloca before a call, +/// then put in back in the alloca afterwards. +/// +/// Returns an address that will stand in for the swifterror slot +/// until splitting. +static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call, + AllocaInst *Alloca, + coro::Shape &Shape) { + auto ValueTy = Alloca->getAllocatedType(); + IRBuilder<> Builder(Call); + + // Load the current value from the alloca and set it as the + // swifterror value. + auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca); + auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape); + + // Move to after the call. Since swifterror only has a guaranteed + // value on normal exits, we can ignore implicit and explicit unwind + // edges. + if (isa<CallInst>(Call)) { + Builder.SetInsertPoint(Call->getNextNode()); + } else { + auto Invoke = cast<InvokeInst>(Call); + Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg()); + } + + // Get the current swifterror value and store it to the alloca. + auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape); + Builder.CreateStore(ValueAfterCall, Alloca); + + return Addr; +} + +/// Eliminate a formerly-swifterror alloca by inserting the get/set +/// intrinsics and attempting to MemToReg the alloca away. +static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca, + coro::Shape &Shape) { + for (auto UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { + // We're likely changing the use list, so use a mutation-safe + // iteration pattern. + auto &Use = *UI; + ++UI; + + // swifterror values can only be used in very specific ways. + // We take advantage of that here. + auto User = Use.getUser(); + if (isa<LoadInst>(User) || isa<StoreInst>(User)) + continue; + + assert(isa<CallInst>(User) || isa<InvokeInst>(User)); + auto Call = cast<Instruction>(User); + + auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape); + + // Use the returned slot address as the call argument. + Use.set(Addr); + } + + // All the uses should be loads and stores now. + assert(isAllocaPromotable(Alloca)); +} + +/// "Eliminate" a swifterror argument by reducing it to the alloca case +/// and then loading and storing in the prologue and epilog. +/// +/// The argument keeps the swifterror flag. +static void eliminateSwiftErrorArgument(Function &F, Argument &Arg, + coro::Shape &Shape, + SmallVectorImpl<AllocaInst*> &AllocasToPromote) { + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + + auto ArgTy = cast<PointerType>(Arg.getType()); + auto ValueTy = ArgTy->getElementType(); + + // Reduce to the alloca case: + + // Create an alloca and replace all uses of the arg with it. + auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace()); + Arg.replaceAllUsesWith(Alloca); + + // Set an initial value in the alloca. swifterror is always null on entry. + auto InitialValue = Constant::getNullValue(ValueTy); + Builder.CreateStore(InitialValue, Alloca); + + // Find all the suspends in the function and save and restore around them. 
+ for (auto Suspend : Shape.CoroSuspends) { + (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape); + } + + // Find all the coro.ends in the function and restore the error value. + for (auto End : Shape.CoroEnds) { + Builder.SetInsertPoint(End); + auto FinalValue = Builder.CreateLoad(ValueTy, Alloca); + (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape); + } + + // Now we can use the alloca logic. + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); +} + +/// Eliminate all problematic uses of swifterror arguments and allocas +/// from the function. We'll fix them up later when splitting the function. +static void eliminateSwiftError(Function &F, coro::Shape &Shape) { + SmallVector<AllocaInst*, 4> AllocasToPromote; + + // Look for a swifterror argument. + for (auto &Arg : F.args()) { + if (!Arg.hasSwiftErrorAttr()) continue; + + eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote); + break; + } + + // Look for swifterror allocas. + for (auto &Inst : F.getEntryBlock()) { + auto Alloca = dyn_cast<AllocaInst>(&Inst); + if (!Alloca || !Alloca->isSwiftError()) continue; + + // Clear the swifterror flag. + Alloca->setSwiftError(false); + + AllocasToPromote.push_back(Alloca); + eliminateSwiftErrorAlloca(F, Alloca, Shape); + } + + // If we have any allocas to promote, compute a dominator tree and + // promote them en masse. + if (!AllocasToPromote.empty()) { + DominatorTree DT(F); + PromoteMemToReg(AllocasToPromote, DT); + } +} + void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite // access to local variables. LowerDbgDeclare(F); - Shape.PromiseAlloca = Shape.CoroBegin->getId()->getPromise(); - if (Shape.PromiseAlloca) { - Shape.CoroBegin->getId()->clearPromise(); + eliminateSwiftError(F, Shape); + + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca) { + Shape.getSwitchCoroId()->clearPromise(); } // Make sure that all coro.save, coro.suspend and the fallthrough coro.end // intrinsics are in their own blocks to simplify the logic of building up // SuspendCrossing data. - for (CoroSuspendInst *CSI : Shape.CoroSuspends) { - splitAround(CSI->getCoroSave(), "CoroSave"); + for (auto *CSI : Shape.CoroSuspends) { + if (auto *Save = CSI->getCoroSave()) + splitAround(Save, "CoroSave"); splitAround(CSI, "CoroSuspend"); } @@ -926,6 +1356,8 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; + SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas; + SmallVector<Instruction*, 4> DeadInstructions; for (int Repeat = 0; Repeat < 4; ++Repeat) { // See if there are materializable instructions across suspend points. @@ -955,11 +1387,40 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { // of the Coroutine Frame. if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin) continue; + // The Coroutine Promise always included into coroutine frame, no need to // check for suspend crossing. - if (Shape.PromiseAlloca == &I) + if (Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.PromiseAlloca == &I) continue; + // Handle alloca.alloc specially here. + if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) { + // Check whether the alloca's lifetime is bounded by suspend points. + if (isLocalAlloca(AI)) { + LocalAllocas.push_back(AI); + continue; + } + + // If not, do a quick rewrite of the alloca and then add spills of + // the rewritten value. 
The rewrite doesn't invalidate anything in + // Spills because the other alloca intrinsics have no other operands + // besides AI, and it doesn't invalidate the iteration because we delay + // erasing AI. + auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions); + + for (User *U : Alloc->users()) { + if (Checker.isDefinitionAcrossSuspend(*Alloc, U)) + Spills.emplace_back(Alloc, U); + } + continue; + } + + // Ignore alloca.get; we process this as part of coro.alloca.alloc. + if (isa<CoroAllocaGetInst>(I)) { + continue; + } + for (User *U : I.users()) if (Checker.isDefinitionAcrossSuspend(I, U)) { // We cannot spill a token. @@ -970,7 +1431,10 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { } } LLVM_DEBUG(dump("Spills", Spills)); - moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin); Shape.FrameTy = buildFrameType(F, Shape, Spills); Shape.FramePtr = insertSpills(Spills, Shape); + lowerLocalAllocas(LocalAllocas, DeadInstructions); + + for (auto I : DeadInstructions) + I->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h index 5e19d7642e38..de2d2920cb15 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInstr.h @@ -27,6 +27,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { @@ -77,10 +78,8 @@ public: } }; -/// This represents the llvm.coro.alloc instruction. -class LLVM_LIBRARY_VISIBILITY CoroIdInst : public IntrinsicInst { - enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; - +/// This represents a common base class for llvm.coro.id instructions. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdInst : public IntrinsicInst { public: CoroAllocInst *getCoroAlloc() { for (User *U : users()) @@ -97,6 +96,24 @@ public: llvm_unreachable("no coro.begin associated with coro.id"); } + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id || + ID == Intrinsic::coro_id_retcon || + ID == Intrinsic::coro_id_retcon_once; + } + + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdInst : public AnyCoroIdInst { + enum { AlignArg, PromiseArg, CoroutineArg, InfoArg }; + +public: AllocaInst *getPromise() const { Value *Arg = getArgOperand(PromiseArg); return isa<ConstantPointerNull>(Arg) @@ -182,6 +199,80 @@ public: } }; +/// This represents either the llvm.coro.id.retcon or +/// llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY AnyCoroIdRetconInst : public AnyCoroIdInst { + enum { SizeArg, AlignArg, StorageArg, PrototypeArg, AllocArg, DeallocArg }; + +public: + void checkWellFormed() const; + + uint64_t getStorageSize() const { + return cast<ConstantInt>(getArgOperand(SizeArg))->getZExtValue(); + } + + uint64_t getStorageAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + } + + Value *getStorage() const { + return getArgOperand(StorageArg); + } + + /// Return the prototype for the continuation function. The type, + /// attributes, and calling convention of the continuation function(s) + /// are taken from this declaration. 
+ Function *getPrototype() const { + return cast<Function>(getArgOperand(PrototypeArg)->stripPointerCasts()); + } + + /// Return the function to use for allocating memory. + Function *getAllocFunction() const { + return cast<Function>(getArgOperand(AllocArg)->stripPointerCasts()); + } + + /// Return the function to use for deallocating memory. + Function *getDeallocFunction() const { + return cast<Function>(getArgOperand(DeallocArg)->stripPointerCasts()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + auto ID = I->getIntrinsicID(); + return ID == Intrinsic::coro_id_retcon + || ID == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id.retcon instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.id.retcon.once instruction. +class LLVM_LIBRARY_VISIBILITY CoroIdRetconOnceInst + : public AnyCoroIdRetconInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_id_retcon_once; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.frame instruction. class LLVM_LIBRARY_VISIBILITY CoroFrameInst : public IntrinsicInst { public: @@ -215,7 +306,9 @@ class LLVM_LIBRARY_VISIBILITY CoroBeginInst : public IntrinsicInst { enum { IdArg, MemArg }; public: - CoroIdInst *getId() const { return cast<CoroIdInst>(getArgOperand(IdArg)); } + AnyCoroIdInst *getId() const { + return cast<AnyCoroIdInst>(getArgOperand(IdArg)); + } Value *getMem() const { return getArgOperand(MemArg); } @@ -261,8 +354,22 @@ public: } }; +class LLVM_LIBRARY_VISIBILITY AnyCoroSuspendInst : public IntrinsicInst { +public: + CoroSaveInst *getCoroSave() const; + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend || + I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.suspend instruction. -class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public IntrinsicInst { +class LLVM_LIBRARY_VISIBILITY CoroSuspendInst : public AnyCoroSuspendInst { enum { SaveArg, FinalArg }; public: @@ -273,6 +380,7 @@ public: assert(isa<ConstantTokenNone>(Arg)); return nullptr; } + bool isFinal() const { return cast<Constant>(getArgOperand(FinalArg))->isOneValue(); } @@ -286,6 +394,37 @@ public: } }; +inline CoroSaveInst *AnyCoroSuspendInst::getCoroSave() const { + if (auto Suspend = dyn_cast<CoroSuspendInst>(this)) + return Suspend->getCoroSave(); + return nullptr; +} + +/// This represents the llvm.coro.suspend.retcon instruction. 
+class LLVM_LIBRARY_VISIBILITY CoroSuspendRetconInst : public AnyCoroSuspendInst { +public: + op_iterator value_begin() { return arg_begin(); } + const_op_iterator value_begin() const { return arg_begin(); } + + op_iterator value_end() { return arg_end(); } + const_op_iterator value_end() const { return arg_end(); } + + iterator_range<op_iterator> value_operands() { + return make_range(value_begin(), value_end()); + } + iterator_range<const_op_iterator> value_operands() const { + return make_range(value_begin(), value_end()); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_suspend_retcon; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + /// This represents the llvm.coro.size instruction. class LLVM_LIBRARY_VISIBILITY CoroSizeInst : public IntrinsicInst { public: @@ -317,6 +456,60 @@ public: } }; +/// This represents the llvm.coro.alloca.alloc instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaAllocInst : public IntrinsicInst { + enum { SizeArg, AlignArg }; +public: + Value *getSize() const { + return getArgOperand(SizeArg); + } + unsigned getAlignment() const { + return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue(); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_alloc; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.alloca.get instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaGetInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast<CoroAllocaAllocInst>(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_get; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + +/// This represents the llvm.coro.alloca.free instruction. +class LLVM_LIBRARY_VISIBILITY CoroAllocaFreeInst : public IntrinsicInst { + enum { AllocArg }; +public: + CoroAllocaAllocInst *getAlloc() const { + return cast<CoroAllocaAllocInst>(getArgOperand(AllocArg)); + } + + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_alloca_free; + } + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + } // End namespace llvm. 
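The wrapper classes above all follow the same shape: a private enum naming the argument positions, typed accessors over those arguments, and a pair of classof overloads so that LLVM's isa/dyn_cast machinery can recognize the underlying intrinsic call. A minimal sketch of how code inside the Coroutines library might use one of them; the helper name is hypothetical and not part of this patch:

    #include "CoroInstr.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Hypothetical helper: collect every llvm.coro.alloca.alloc in a function.
    // dyn_cast on a plain Instruction works because of the classof overloads above.
    static void collectCoroAllocas(Function &F,
                                   SmallVectorImpl<CoroAllocaAllocInst *> &Out) {
      for (BasicBlock &BB : F)
        for (Instruction &I : BB)
          if (auto *Alloc = dyn_cast<CoroAllocaAllocInst>(&I))
            Out.push_back(Alloc); // getSize()/getAlignment() are now available
    }
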
#endif diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h index 441c8a20f1f3..7eb35400c0d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -12,6 +12,7 @@ #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #include "CoroInstr.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Coroutines.h" namespace llvm { @@ -20,10 +21,10 @@ class CallGraph; class CallGraphSCC; class PassRegistry; -void initializeCoroEarlyPass(PassRegistry &); -void initializeCoroSplitPass(PassRegistry &); -void initializeCoroElidePass(PassRegistry &); -void initializeCoroCleanupPass(PassRegistry &); +void initializeCoroEarlyLegacyPass(PassRegistry &); +void initializeCoroSplitLegacyPass(PassRegistry &); +void initializeCoroElideLegacyPass(PassRegistry &); +void initializeCoroCleanupLegacyPass(PassRegistry &); // CoroEarly pass marks every function that has coro.begin with a string // attribute "coroutine.presplit"="0". CoroSplit pass processes the coroutine @@ -42,7 +43,8 @@ void initializeCoroCleanupPass(PassRegistry &); namespace coro { -bool declaresIntrinsics(Module &M, std::initializer_list<StringRef>); +bool declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef>); void replaceAllCoroAllocs(CoroBeginInst *CB, bool Replacement); void replaceAllCoroFrees(CoroBeginInst *CB, Value *Replacement); void replaceCoroFree(CoroIdInst *CoroId, bool Elide); @@ -61,37 +63,174 @@ struct LowererBase { Value *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; +enum class ABI { + /// The "resume-switch" lowering, where there are separate resume and + /// destroy functions that are shared between all suspend points. The + /// coroutine frame implicitly stores the resume and destroy functions, + /// the current index, and any promise value. + Switch, + + /// The "returned-continuation" lowering, where each suspend point creates a + /// single continuation function that is used for both resuming and + /// destroying. Does not support promises. + Retcon, + + /// The "unique returned-continuation" lowering, where each suspend point + /// creates a single continuation function that is used for both resuming + /// and destroying. Does not support promises. The function is known to + /// suspend at most once during its execution, and the return value of + /// the continuation is void. + RetconOnce, +}; + // Holds structural Coroutine Intrinsics for a particular function and other // values used during CoroSplit pass. struct LLVM_LIBRARY_VISIBILITY Shape { CoroBeginInst *CoroBegin; SmallVector<CoroEndInst *, 4> CoroEnds; SmallVector<CoroSizeInst *, 2> CoroSizes; - SmallVector<CoroSuspendInst *, 4> CoroSuspends; - - // Field Indexes for known coroutine frame fields. - enum { - ResumeField, - DestroyField, - PromiseField, - IndexField, + SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends; + SmallVector<CallInst*, 2> SwiftErrorOps; + + // Field indexes for special fields in the switch lowering. + struct SwitchFieldIndex { + enum { + Resume, + Destroy, + Promise, + Index, + /// The index of the first spill field. 
+ FirstSpill + }; }; + coro::ABI ABI; + StructType *FrameTy; Instruction *FramePtr; BasicBlock *AllocaSpillBlock; - SwitchInst *ResumeSwitch; - AllocaInst *PromiseAlloca; - bool HasFinalSuspend; + + struct SwitchLoweringStorage { + SwitchInst *ResumeSwitch; + AllocaInst *PromiseAlloca; + BasicBlock *ResumeEntryBlock; + bool HasFinalSuspend; + }; + + struct RetconLoweringStorage { + Function *ResumePrototype; + Function *Alloc; + Function *Dealloc; + BasicBlock *ReturnBlock; + bool IsFrameInlineInStorage; + }; + + union { + SwitchLoweringStorage SwitchLowering; + RetconLoweringStorage RetconLowering; + }; + + CoroIdInst *getSwitchCoroId() const { + assert(ABI == coro::ABI::Switch); + return cast<CoroIdInst>(CoroBegin->getId()); + } + + AnyCoroIdRetconInst *getRetconCoroId() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + return cast<AnyCoroIdRetconInst>(CoroBegin->getId()); + } IntegerType *getIndexType() const { + assert(ABI == coro::ABI::Switch); assert(FrameTy && "frame type not assigned"); - return cast<IntegerType>(FrameTy->getElementType(IndexField)); + return cast<IntegerType>(FrameTy->getElementType(SwitchFieldIndex::Index)); } ConstantInt *getIndex(uint64_t Value) const { return ConstantInt::get(getIndexType(), Value); } + PointerType *getSwitchResumePointerType() const { + assert(ABI == coro::ABI::Switch); + assert(FrameTy && "frame type not assigned"); + return cast<PointerType>(FrameTy->getElementType(SwitchFieldIndex::Resume)); + } + + FunctionType *getResumeFunctionType() const { + switch (ABI) { + case coro::ABI::Switch: { + auto *FnPtrTy = getSwitchResumePointerType(); + return cast<FunctionType>(FnPtrTy->getPointerElementType()); + } + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getFunctionType(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + ArrayRef<Type*> getRetconResultTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + auto FTy = CoroBegin->getFunction()->getFunctionType(); + + // The safety of all this is checked by checkWFRetconPrototype. + if (auto STy = dyn_cast<StructType>(FTy->getReturnType())) { + return STy->elements().slice(1); + } else { + return ArrayRef<Type*>(); + } + } + + ArrayRef<Type*> getRetconResumeTypes() const { + assert(ABI == coro::ABI::Retcon || + ABI == coro::ABI::RetconOnce); + + // The safety of all this is checked by checkWFRetconPrototype. + auto FTy = RetconLowering.ResumePrototype->getFunctionType(); + return FTy->params().slice(1); + } + + CallingConv::ID getResumeFunctionCC() const { + switch (ABI) { + case coro::ABI::Switch: + return CallingConv::Fast; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return RetconLowering.ResumePrototype->getCallingConv(); + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + unsigned getFirstSpillFieldIndex() const { + switch (ABI) { + case coro::ABI::Switch: + return SwitchFieldIndex::FirstSpill; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return 0; + } + llvm_unreachable("Unknown coro::ABI enum"); + } + + AllocaInst *getPromiseAlloca() const { + if (ABI == coro::ABI::Switch) + return SwitchLowering.PromiseAlloca; + return nullptr; + } + + /// Allocate memory according to the rules of the active lowering. + /// + /// \param CG - if non-null, will be updated for the new call + Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const; + + /// Deallocate memory according to the rules of the active lowering. 
+ /// + /// \param CG - if non-null, will be updated for the new call + void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const; + Shape() = default; explicit Shape(Function &F) { buildFrom(F); } void buildFrom(Function &F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 5458e70ff16a..66cb3e74e53e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -27,7 +27,6 @@ #include "llvm/ADT/Twine.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -52,13 +51,16 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include <cassert> #include <cstddef> @@ -70,9 +72,198 @@ using namespace llvm; #define DEBUG_TYPE "coro-split" +namespace { + +/// A little helper class for building +class CoroCloner { +public: + enum class Kind { + /// The shared resume function for a switch lowering. + SwitchResume, + + /// The shared unwind function for a switch lowering. + SwitchUnwind, + + /// The shared cleanup function for a switch lowering. + SwitchCleanup, + + /// An individual continuation function. + Continuation, + }; +private: + Function &OrigF; + Function *NewF; + const Twine &Suffix; + coro::Shape &Shape; + Kind FKind; + ValueToValueMapTy VMap; + IRBuilder<> Builder; + Value *NewFramePtr = nullptr; + Value *SwiftErrorSlot = nullptr; + + /// The active suspend instruction; meaningful only for continuation ABIs. + AnyCoroSuspendInst *ActiveSuspend = nullptr; + +public: + /// Create a cloner for a switch lowering. + CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Kind FKind) + : OrigF(OrigF), NewF(nullptr), Suffix(Suffix), Shape(Shape), + FKind(FKind), Builder(OrigF.getContext()) { + assert(Shape.ABI == coro::ABI::Switch); + } + + /// Create a cloner for a continuation lowering. 
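The new coro::ABI tag and the per-lowering accessors added to coro::Shape (getSwitchCoroId, getRetconCoroId, getPromiseAlloca, and so on) are what keep the SwitchLowering/RetconLowering union safe to use: consumers switch exhaustively over the ABI with no default case, so a future lowering surfaces as a compiler warning rather than a silent fall-through. A minimal sketch of that idiom, using a hypothetical query that is not part of the patch:

    #include "CoroInternal.h"
    #include "llvm/Support/ErrorHandling.h"

    using namespace llvm;

    // Hypothetical query: does this lowering keep a promise object in the frame?
    // The default-less switch mirrors the accessors defined on coro::Shape above.
    static bool hasPromiseField(const coro::Shape &Shape) {
      switch (Shape.ABI) {
      case coro::ABI::Switch:
        return Shape.getPromiseAlloca() != nullptr;
      case coro::ABI::Retcon:
      case coro::ABI::RetconOnce:
        return false; // the continuation lowerings do not support promises
      }
      llvm_unreachable("Unknown coro::ABI enum");
    }
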
+ CoroCloner(Function &OrigF, const Twine &Suffix, coro::Shape &Shape, + Function *NewF, AnyCoroSuspendInst *ActiveSuspend) + : OrigF(OrigF), NewF(NewF), Suffix(Suffix), Shape(Shape), + FKind(Kind::Continuation), Builder(OrigF.getContext()), + ActiveSuspend(ActiveSuspend) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(NewF && "need existing function for continuation"); + assert(ActiveSuspend && "need active suspend point for continuation"); + } + + Function *getFunction() const { + assert(NewF != nullptr && "declaration not yet set"); + return NewF; + } + + void create(); + +private: + bool isSwitchDestroyFunction() { + switch (FKind) { + case Kind::Continuation: + case Kind::SwitchResume: + return false; + case Kind::SwitchUnwind: + case Kind::SwitchCleanup: + return true; + } + llvm_unreachable("Unknown CoroCloner::Kind enum"); + } + + void createDeclaration(); + void replaceEntryBlock(); + Value *deriveNewFramePointer(); + void replaceRetconSuspendUses(); + void replaceCoroSuspends(); + void replaceCoroEnds(); + void replaceSwiftErrorOps(); + void handleFinalSuspend(); + void maybeFreeContinuationStorage(); +}; + +} // end anonymous namespace + +static void maybeFreeRetconStorage(IRBuilder<> &Builder, + const coro::Shape &Shape, Value *FramePtr, + CallGraph *CG) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + if (Shape.RetconLowering.IsFrameInlineInStorage) + return; + + Shape.emitDealloc(Builder, FramePtr, CG); +} + +/// Replace a non-unwind call to llvm.coro.end. +static void replaceFallthroughCoroEnd(CoroEndInst *End, + const coro::Shape &Shape, Value *FramePtr, + bool InResume, CallGraph *CG) { + // Start inserting right before the coro.end. + IRBuilder<> Builder(End); + + // Create the return instruction. + switch (Shape.ABI) { + // The cloned functions in switch-lowering always return void. + case coro::ABI::Switch: + // coro.end doesn't immediately end the coroutine in the main function + // in this lowering, because we need to deallocate the coroutine. + if (!InResume) + return; + Builder.CreateRetVoid(); + break; + + // In unique continuation lowering, the continuations always return void. + // But we may have implicitly allocated storage. + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + Builder.CreateRetVoid(); + break; + + // In non-unique continuation lowering, we signal completion by returning + // a null continuation. + case coro::ABI::Retcon: { + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + auto RetTy = Shape.getResumeFunctionType()->getReturnType(); + auto RetStructTy = dyn_cast<StructType>(RetTy); + PointerType *ContinuationTy = + cast<PointerType>(RetStructTy ? RetStructTy->getElementType(0) : RetTy); + + Value *ReturnValue = ConstantPointerNull::get(ContinuationTy); + if (RetStructTy) { + ReturnValue = Builder.CreateInsertValue(UndefValue::get(RetStructTy), + ReturnValue, 0); + } + Builder.CreateRet(ReturnValue); + break; + } + } + + // Remove the rest of the block, by splitting it into an unreachable block. + auto *BB = End->getParent(); + BB->splitBasicBlock(End); + BB->getTerminator()->eraseFromParent(); +} + +/// Replace an unwind call to llvm.coro.end. +static void replaceUnwindCoroEnd(CoroEndInst *End, const coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG){ + IRBuilder<> Builder(End); + + switch (Shape.ABI) { + // In switch-lowering, this does nothing in the main function. 
+ case coro::ABI::Switch: + if (!InResume) + return; + break; + + // In continuation-lowering, this frees the continuation storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + maybeFreeRetconStorage(Builder, Shape, FramePtr, CG); + break; + } + + // If coro.end has an associated bundle, add cleanupret instruction. + if (auto Bundle = End->getOperandBundle(LLVMContext::OB_funclet)) { + auto *FromPad = cast<CleanupPadInst>(Bundle->Inputs[0]); + auto *CleanupRet = Builder.CreateCleanupRet(FromPad, nullptr); + End->getParent()->splitBasicBlock(End); + CleanupRet->getParent()->getTerminator()->eraseFromParent(); + } +} + +static void replaceCoroEnd(CoroEndInst *End, const coro::Shape &Shape, + Value *FramePtr, bool InResume, CallGraph *CG) { + if (End->isUnwind()) + replaceUnwindCoroEnd(End, Shape, FramePtr, InResume, CG); + else + replaceFallthroughCoroEnd(End, Shape, FramePtr, InResume, CG); + + auto &Context = End->getContext(); + End->replaceAllUsesWith(InResume ? ConstantInt::getTrue(Context) + : ConstantInt::getFalse(Context)); + End->eraseFromParent(); +} + // Create an entry block for a resume function with a switch that will jump to // suspend points. -static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { +static void createResumeEntryBlock(Function &F, coro::Shape &Shape) { + assert(Shape.ABI == coro::ABI::Switch); LLVMContext &C = F.getContext(); // resume.entry: @@ -91,15 +282,16 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { IRBuilder<> Builder(NewEntry); auto *FramePtr = Shape.FramePtr; auto *FrameTy = Shape.FrameTy; - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); auto *Index = Builder.CreateLoad(Shape.getIndexType(), GepIndex, "index"); auto *Switch = Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size()); - Shape.ResumeSwitch = Switch; + Shape.SwitchLowering.ResumeSwitch = Switch; size_t SuspendIndex = 0; - for (CoroSuspendInst *S : Shape.CoroSuspends) { + for (auto *AnyS : Shape.CoroSuspends) { + auto *S = cast<CoroSuspendInst>(AnyS); ConstantInt *IndexVal = Shape.getIndex(SuspendIndex); // Replace CoroSave with a store to Index: @@ -109,14 +301,15 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(Save); if (S->isFinal()) { // Final suspend point is represented by storing zero in ResumeFnAddr. 
- auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(FrameTy, FramePtr, 0, - 0, "ResumeFn.addr"); + auto *GepIndex = Builder.CreateStructGEP(FrameTy, FramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); auto *NullPtr = ConstantPointerNull::get(cast<PointerType>( cast<PointerType>(GepIndex->getType())->getElementType())); Builder.CreateStore(NullPtr, GepIndex); } else { - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32( - FrameTy, FramePtr, 0, coro::Shape::IndexField, "index.addr"); + auto *GepIndex = Builder.CreateStructGEP( + FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr"); Builder.CreateStore(IndexVal, GepIndex); } Save->replaceAllUsesWith(ConstantTokenNone::get(C)); @@ -164,48 +357,9 @@ static BasicBlock *createResumeEntryBlock(Function &F, coro::Shape &Shape) { Builder.SetInsertPoint(UnreachBB); Builder.CreateUnreachable(); - return NewEntry; -} - -// In Resumers, we replace fallthrough coro.end with ret void and delete the -// rest of the block. -static void replaceFallthroughCoroEnd(IntrinsicInst *End, - ValueToValueMapTy &VMap) { - auto *NewE = cast<IntrinsicInst>(VMap[End]); - ReturnInst::Create(NewE->getContext(), nullptr, NewE); - - // Remove the rest of the block, by splitting it into an unreachable block. - auto *BB = NewE->getParent(); - BB->splitBasicBlock(NewE); - BB->getTerminator()->eraseFromParent(); + Shape.SwitchLowering.ResumeEntryBlock = NewEntry; } -// In Resumers, we replace unwind coro.end with True to force the immediate -// unwind to caller. -static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *True = ConstantInt::getTrue(Context); - for (CoroEndInst *CE : Shape.CoroEnds) { - if (!CE->isUnwind()) - continue; - - auto *NewCE = cast<IntrinsicInst>(VMap[CE]); - - // If coro.end has an associated bundle, add cleanupret instruction. - if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) { - Value *FromPad = Bundle->Inputs[0]; - auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE); - NewCE->getParent()->splitBasicBlock(NewCE); - CleanupRet->getParent()->getTerminator()->eraseFromParent(); - } - - NewCE->replaceAllUsesWith(True); - NewCE->eraseFromParent(); - } -} // Rewrite final suspend point handling. We do not use suspend index to // represent the final suspend point. Instead we zero-out ResumeFnAddr in the @@ -216,83 +370,364 @@ static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) { // In the destroy function, we add a code sequence to check if ResumeFnAddress // is Null, and if so, jump to the appropriate label to handle cleanup from the // final suspend point. 
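The effect of this rewrite is easiest to see from the frame's point of view. A conceptual sketch, with field and function names that are illustrative only and not taken from the patch, of what the destroy clone ends up doing at run time:

    // Illustrative subset of the switch-lowering frame fields.
    struct FrameSketch {
      void (*ResumeFn)(FrameSketch *); // zeroed once the final suspend is reached
      void (*DestroyFn)(FrameSketch *);
      int Index;                       // selects among the non-final suspend points
    };

    // What the cloned destroy function effectively checks after this rewrite.
    static void destroySketch(FrameSketch *Frame) {
      if (Frame->ResumeFn == nullptr) {
        // Suspended at the final point: branch straight to its cleanup code.
      } else {
        // Otherwise dispatch on Frame->Index, as the resume entry block does.
      }
    }
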
-static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr, - coro::Shape &Shape, SwitchInst *Switch, - bool IsDestroy) { - assert(Shape.HasFinalSuspend); +void CoroCloner::handleFinalSuspend() { + assert(Shape.ABI == coro::ABI::Switch && + Shape.SwitchLowering.HasFinalSuspend); + auto *Switch = cast<SwitchInst>(VMap[Shape.SwitchLowering.ResumeSwitch]); auto FinalCaseIt = std::prev(Switch->case_end()); BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor(); Switch->removeCase(FinalCaseIt); - if (IsDestroy) { + if (isSwitchDestroyFunction()) { BasicBlock *OldSwitchBB = Switch->getParent(); auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch"); Builder.SetInsertPoint(OldSwitchBB->getTerminator()); - auto *GepIndex = Builder.CreateConstInBoundsGEP2_32(Shape.FrameTy, FramePtr, - 0, 0, "ResumeFn.addr"); - auto *Load = Builder.CreateLoad( - Shape.FrameTy->getElementType(coro::Shape::ResumeField), GepIndex); - auto *NullPtr = - ConstantPointerNull::get(cast<PointerType>(Load->getType())); - auto *Cond = Builder.CreateICmpEQ(Load, NullPtr); + auto *GepIndex = Builder.CreateStructGEP(Shape.FrameTy, NewFramePtr, + coro::Shape::SwitchFieldIndex::Resume, + "ResumeFn.addr"); + auto *Load = Builder.CreateLoad(Shape.getSwitchResumePointerType(), + GepIndex); + auto *Cond = Builder.CreateIsNull(Load); Builder.CreateCondBr(Cond, ResumeBB, NewSwitchBB); OldSwitchBB->getTerminator()->eraseFromParent(); } } -// Create a resume clone by cloning the body of the original function, setting -// new entry block and replacing coro.suspend an appropriate value to force -// resume or cleanup pass for every suspend point. -static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, - BasicBlock *ResumeEntry, int8_t FnIndex) { - Module *M = F.getParent(); - auto *FrameTy = Shape.FrameTy; - auto *FnPtrTy = cast<PointerType>(FrameTy->getElementType(0)); - auto *FnTy = cast<FunctionType>(FnPtrTy->getElementType()); +static Function *createCloneDeclaration(Function &OrigF, coro::Shape &Shape, + const Twine &Suffix, + Module::iterator InsertBefore) { + Module *M = OrigF.getParent(); + auto *FnTy = Shape.getResumeFunctionType(); Function *NewF = - Function::Create(FnTy, GlobalValue::LinkageTypes::ExternalLinkage, - F.getName() + Suffix, M); + Function::Create(FnTy, GlobalValue::LinkageTypes::InternalLinkage, + OrigF.getName() + Suffix); NewF->addParamAttr(0, Attribute::NonNull); NewF->addParamAttr(0, Attribute::NoAlias); - ValueToValueMapTy VMap; + M->getFunctionList().insert(InsertBefore, NewF); + + return NewF; +} + +/// Replace uses of the active llvm.coro.suspend.retcon call with the +/// arguments to the continuation function. +/// +/// This assumes that the builder has a meaningful insertion point. +void CoroCloner::replaceRetconSuspendUses() { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + + auto NewS = VMap[ActiveSuspend]; + if (NewS->use_empty()) return; + + // Copy out all the continuation arguments after the buffer pointer into + // an easily-indexed data structure for convenience. + SmallVector<Value*, 8> Args; + for (auto I = std::next(NewF->arg_begin()), E = NewF->arg_end(); I != E; ++I) + Args.push_back(&*I); + + // If the suspend returns a single scalar value, we can just do a simple + // replacement. + if (!isa<StructType>(NewS->getType())) { + assert(Args.size() == 1); + NewS->replaceAllUsesWith(Args.front()); + return; + } + + // Try to peephole extracts of an aggregate return. 
+ for (auto UI = NewS->use_begin(), UE = NewS->use_end(); UI != UE; ) { + auto EVI = dyn_cast<ExtractValueInst>((UI++)->getUser()); + if (!EVI || EVI->getNumIndices() != 1) + continue; + + EVI->replaceAllUsesWith(Args[EVI->getIndices().front()]); + EVI->eraseFromParent(); + } + + // If we have no remaining uses, we're done. + if (NewS->use_empty()) return; + + // Otherwise, we need to create an aggregate. + Value *Agg = UndefValue::get(NewS->getType()); + for (size_t I = 0, E = Args.size(); I != E; ++I) + Agg = Builder.CreateInsertValue(Agg, Args[I], I); + + NewS->replaceAllUsesWith(Agg); +} + +void CoroCloner::replaceCoroSuspends() { + Value *SuspendResult; + + switch (Shape.ABI) { + // In switch lowering, replace coro.suspend with the appropriate value + // for the type of function we're extracting. + // Replacing coro.suspend with (0) will result in control flow proceeding to + // a resume label associated with a suspend point, replacing it with (1) will + // result in control flow proceeding to a cleanup label associated with this + // suspend point. + case coro::ABI::Switch: + SuspendResult = Builder.getInt8(isSwitchDestroyFunction() ? 1 : 0); + break; + + // In returned-continuation lowering, the arguments from earlier + // continuations are theoretically arbitrary, and they should have been + // spilled. + case coro::ABI::RetconOnce: + case coro::ABI::Retcon: + return; + } + + for (AnyCoroSuspendInst *CS : Shape.CoroSuspends) { + // The active suspend was handled earlier. + if (CS == ActiveSuspend) continue; + + auto *MappedCS = cast<AnyCoroSuspendInst>(VMap[CS]); + MappedCS->replaceAllUsesWith(SuspendResult); + MappedCS->eraseFromParent(); + } +} + +void CoroCloner::replaceCoroEnds() { + for (CoroEndInst *CE : Shape.CoroEnds) { + // We use a null call graph because there's no call graph node for + // the cloned function yet. We'll just be rebuilding that later. + auto NewCE = cast<CoroEndInst>(VMap[CE]); + replaceCoroEnd(NewCE, Shape, NewFramePtr, /*in resume*/ true, nullptr); + } +} + +static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, + ValueToValueMapTy *VMap) { + Value *CachedSlot = nullptr; + auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * { + if (CachedSlot) { + assert(CachedSlot->getType()->getPointerElementType() == ValueTy && + "multiple swifterror slots in function with different types"); + return CachedSlot; + } + + // Check if the function has a swifterror argument. + for (auto &Arg : F.args()) { + if (Arg.isSwiftError()) { + CachedSlot = &Arg; + assert(Arg.getType()->getPointerElementType() == ValueTy && + "swifterror argument does not have expected type"); + return &Arg; + } + } + + // Create a swifterror alloca. + IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg()); + auto Alloca = Builder.CreateAlloca(ValueTy); + Alloca->setSwiftError(true); + + CachedSlot = Alloca; + return Alloca; + }; + + for (CallInst *Op : Shape.SwiftErrorOps) { + auto MappedOp = VMap ? cast<CallInst>((*VMap)[Op]) : Op; + IRBuilder<> Builder(MappedOp); + + // If there are no arguments, this is a 'get' operation. 
+ Value *MappedResult; + if (Op->getNumArgOperands() == 0) { + auto ValueTy = Op->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + MappedResult = Builder.CreateLoad(ValueTy, Slot); + } else { + assert(Op->getNumArgOperands() == 1); + auto Value = MappedOp->getArgOperand(0); + auto ValueTy = Value->getType(); + auto Slot = getSwiftErrorSlot(ValueTy); + Builder.CreateStore(Value, Slot); + MappedResult = Slot; + } + + MappedOp->replaceAllUsesWith(MappedResult); + MappedOp->eraseFromParent(); + } + + // If we're updating the original function, we've invalidated SwiftErrorOps. + if (VMap == nullptr) { + Shape.SwiftErrorOps.clear(); + } +} + +void CoroCloner::replaceSwiftErrorOps() { + ::replaceSwiftErrorOps(*NewF, Shape, &VMap); +} + +void CoroCloner::replaceEntryBlock() { + // In the original function, the AllocaSpillBlock is a block immediately + // following the allocation of the frame object which defines GEPs for + // all the allocas that have been moved into the frame, and it ends by + // branching to the original beginning of the coroutine. Make this + // the entry block of the cloned function. + auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]); + Entry->setName("entry" + Suffix); + Entry->moveBefore(&NewF->getEntryBlock()); + Entry->getTerminator()->eraseFromParent(); + + // Clear all predecessors of the new entry block. There should be + // exactly one predecessor, which we created when splitting out + // AllocaSpillBlock to begin with. + assert(Entry->hasOneUse()); + auto BranchToEntry = cast<BranchInst>(Entry->user_back()); + assert(BranchToEntry->isUnconditional()); + Builder.SetInsertPoint(BranchToEntry); + Builder.CreateUnreachable(); + BranchToEntry->eraseFromParent(); + + // TODO: move any allocas into Entry that weren't moved into the frame. + // (Currently we move all allocas into the frame.) + + // Branch from the entry to the appropriate place. + Builder.SetInsertPoint(Entry); + switch (Shape.ABI) { + case coro::ABI::Switch: { + // In switch-lowering, we built a resume-entry block in the original + // function. Make the entry block branch to this. + auto *SwitchBB = + cast<BasicBlock>(VMap[Shape.SwitchLowering.ResumeEntryBlock]); + Builder.CreateBr(SwitchBB); + break; + } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + // In continuation ABIs, we want to branch to immediately after the + // active suspend point. Earlier phases will have put the suspend in its + // own basic block, so just thread our jump directly to its successor. + auto MappedCS = cast<CoroSuspendRetconInst>(VMap[ActiveSuspend]); + auto Branch = cast<BranchInst>(MappedCS->getNextNode()); + assert(Branch->isUnconditional()); + Builder.CreateBr(Branch->getSuccessor(0)); + break; + } + } +} + +/// Derive the value of the new frame pointer. +Value *CoroCloner::deriveNewFramePointer() { + // Builder should be inserting to the front of the new entry block. + + switch (Shape.ABI) { + // In switch-lowering, the argument is the frame pointer. + case coro::ABI::Switch: + return &*NewF->arg_begin(); + + // In continuation-lowering, the argument is the opaque storage. + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: { + Argument *NewStorage = &*NewF->arg_begin(); + auto FramePtrTy = Shape.FrameTy->getPointerTo(); + + // If the storage is inline, just bitcast to the storage to the frame type. + if (Shape.RetconLowering.IsFrameInlineInStorage) + return Builder.CreateBitCast(NewStorage, FramePtrTy); + + // Otherwise, load the real frame from the opaque storage. 
+ auto FramePtrPtr = + Builder.CreateBitCast(NewStorage, FramePtrTy->getPointerTo()); + return Builder.CreateLoad(FramePtrPtr); + } + } + llvm_unreachable("bad ABI"); +} + +/// Clone the body of the original function into a resume function of +/// some sort. +void CoroCloner::create() { + // Create the new function if we don't already have one. + if (!NewF) { + NewF = createCloneDeclaration(OrigF, Shape, Suffix, + OrigF.getParent()->end()); + } + // Replace all args with undefs. The buildCoroutineFrame algorithm already // rewritten access to the args that occurs after suspend points with loads // and stores to/from the coroutine frame. - for (Argument &A : F.args()) + for (Argument &A : OrigF.args()) VMap[&A] = UndefValue::get(A.getType()); SmallVector<ReturnInst *, 4> Returns; - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); - NewF->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); + // Ignore attempts to change certain attributes of the function. + // TODO: maybe there should be a way to suppress this during cloning? + auto savedVisibility = NewF->getVisibility(); + auto savedUnnamedAddr = NewF->getUnnamedAddr(); + auto savedDLLStorageClass = NewF->getDLLStorageClass(); + + // NewF's linkage (which CloneFunctionInto does *not* change) might not + // be compatible with the visibility of OrigF (which it *does* change), + // so protect against that. + auto savedLinkage = NewF->getLinkage(); + NewF->setLinkage(llvm::GlobalValue::ExternalLinkage); + + CloneFunctionInto(NewF, &OrigF, VMap, /*ModuleLevelChanges=*/true, Returns); + + NewF->setLinkage(savedLinkage); + NewF->setVisibility(savedVisibility); + NewF->setUnnamedAddr(savedUnnamedAddr); + NewF->setDLLStorageClass(savedDLLStorageClass); + + auto &Context = NewF->getContext(); + + // Replace the attributes of the new function: + auto OrigAttrs = NewF->getAttributes(); + auto NewAttrs = AttributeList(); + + switch (Shape.ABI) { + case coro::ABI::Switch: + // Bootstrap attributes by copying function attributes from the + // original function. This should include optimization settings and so on. + NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex, + OrigAttrs.getFnAttributes()); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // If we have a continuation prototype, just use its attributes, + // full-stop. + NewAttrs = Shape.RetconLowering.ResumePrototype->getAttributes(); + break; + } - // Remove old returns. - for (ReturnInst *Return : Returns) - changeToUnreachable(Return, /*UseLLVMTrap=*/false); + // Make the frame parameter nonnull and noalias. + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NonNull); + NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NoAlias); + + switch (Shape.ABI) { + // In these ABIs, the cloned functions always return 'void', and the + // existing return sites are meaningless. Note that for unique + // continuations, this includes the returns associated with suspends; + // this is fine because we can't suspend twice. + case coro::ABI::Switch: + case coro::ABI::RetconOnce: + // Remove old returns. + for (ReturnInst *Return : Returns) + changeToUnreachable(Return, /*UseLLVMTrap=*/false); + break; + + // With multi-suspend continuations, we'll already have eliminated the + // original returns and inserted returns before all the suspend points, + // so we want to leave any returns in place. + case coro::ABI::Retcon: + break; + } - // Remove old return attributes. 
- NewF->removeAttributes( - AttributeList::ReturnIndex, - AttributeFuncs::typeIncompatible(NewF->getReturnType())); + NewF->setAttributes(NewAttrs); + NewF->setCallingConv(Shape.getResumeFunctionCC()); - // Make AllocaSpillBlock the new entry block. - auto *SwitchBB = cast<BasicBlock>(VMap[ResumeEntry]); - auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]); - Entry->moveBefore(&NewF->getEntryBlock()); - Entry->getTerminator()->eraseFromParent(); - BranchInst::Create(SwitchBB, Entry); - Entry->setName("entry" + Suffix); - - // Clear all predecessors of the new entry block. - auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]); - Entry->replaceAllUsesWith(Switch->getDefaultDest()); + // Set up the new entry block. + replaceEntryBlock(); - IRBuilder<> Builder(&NewF->getEntryBlock().front()); + Builder.SetInsertPoint(&NewF->getEntryBlock().front()); + NewFramePtr = deriveNewFramePointer(); // Remap frame pointer. - Argument *NewFramePtr = &*NewF->arg_begin(); - Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]); + Value *OldFramePtr = VMap[Shape.FramePtr]; NewFramePtr->takeName(OldFramePtr); OldFramePtr->replaceAllUsesWith(NewFramePtr); @@ -302,50 +737,55 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, Value *OldVFrame = cast<Value>(VMap[Shape.CoroBegin]); OldVFrame->replaceAllUsesWith(NewVFrame); - // Rewrite final suspend handling as it is not done via switch (allows to - // remove final case from the switch, since it is undefined behavior to resume - // the coroutine suspended at the final suspend point. - if (Shape.HasFinalSuspend) { - auto *Switch = cast<SwitchInst>(VMap[Shape.ResumeSwitch]); - bool IsDestroy = FnIndex != 0; - handleFinalSuspend(Builder, NewFramePtr, Shape, Switch, IsDestroy); + switch (Shape.ABI) { + case coro::ABI::Switch: + // Rewrite final suspend handling as it is not done via switch (allows to + // remove final case from the switch, since it is undefined behavior to + // resume the coroutine suspended at the final suspend point. + if (Shape.SwitchLowering.HasFinalSuspend) + handleFinalSuspend(); + break; + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + // Replace uses of the active suspend with the corresponding + // continuation-function arguments. + assert(ActiveSuspend != nullptr && + "no active suspend when lowering a continuation-style coroutine"); + replaceRetconSuspendUses(); + break; } - // Replace coro suspend with the appropriate resume index. - // Replacing coro.suspend with (0) will result in control flow proceeding to - // a resume label associated with a suspend point, replacing it with (1) will - // result in control flow proceeding to a cleanup label associated with this - // suspend point. - auto *NewValue = Builder.getInt8(FnIndex ? 1 : 0); - for (CoroSuspendInst *CS : Shape.CoroSuspends) { - auto *MappedCS = cast<CoroSuspendInst>(VMap[CS]); - MappedCS->replaceAllUsesWith(NewValue); - MappedCS->eraseFromParent(); - } + // Handle suspends. + replaceCoroSuspends(); + + // Handle swifterror. + replaceSwiftErrorOps(); // Remove coro.end intrinsics. - replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap); - replaceUnwindCoroEnds(Shape, VMap); + replaceCoroEnds(); + // Eliminate coro.free from the clones, replacing it with 'null' in cleanup, // to suppress deallocation code. 
- coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]), - /*Elide=*/FnIndex == 2); - - NewF->setCallingConv(CallingConv::Fast); - - return NewF; + if (Shape.ABI == coro::ABI::Switch) + coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]), + /*Elide=*/ FKind == CoroCloner::Kind::SwitchCleanup); } -static void removeCoroEnds(coro::Shape &Shape) { - if (Shape.CoroEnds.empty()) - return; - - LLVMContext &Context = Shape.CoroEnds.front()->getContext(); - auto *False = ConstantInt::getFalse(Context); +// Create a resume clone by cloning the body of the original function, setting +// new entry block and replacing coro.suspend an appropriate value to force +// resume or cleanup pass for every suspend point. +static Function *createClone(Function &F, const Twine &Suffix, + coro::Shape &Shape, CoroCloner::Kind FKind) { + CoroCloner Cloner(F, Suffix, Shape, FKind); + Cloner.create(); + return Cloner.getFunction(); +} - for (CoroEndInst *CE : Shape.CoroEnds) { - CE->replaceAllUsesWith(False); - CE->eraseFromParent(); +/// Remove calls to llvm.coro.end in the original function. +static void removeCoroEnds(const coro::Shape &Shape, CallGraph *CG) { + for (auto End : Shape.CoroEnds) { + replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, CG); } } @@ -377,8 +817,12 @@ static void replaceFrameSize(coro::Shape &Shape) { // i8* bitcast([2 x void(%f.frame*)*] * @f.resumers to i8*)) // // Assumes that all the functions have the same signature. -static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, - std::initializer_list<Function *> Fns) { +static void setCoroInfo(Function &F, coro::Shape &Shape, + ArrayRef<Function *> Fns) { + // This only works under the switch-lowering ABI because coro elision + // only works on the switch-lowering ABI. + assert(Shape.ABI == coro::ABI::Switch); + SmallVector<Constant *, 4> Args(Fns.begin(), Fns.end()); assert(!Args.empty()); Function *Part = *Fns.begin(); @@ -393,38 +837,45 @@ static void setCoroInfo(Function &F, CoroBeginInst *CoroBegin, // Update coro.begin instruction to refer to this constant. LLVMContext &C = F.getContext(); auto *BC = ConstantExpr::getPointerCast(GV, Type::getInt8PtrTy(C)); - CoroBegin->getId()->setInfo(BC); + Shape.getSwitchCoroId()->setInfo(BC); } // Store addresses of Resume/Destroy/Cleanup functions in the coroutine frame. static void updateCoroFrame(coro::Shape &Shape, Function *ResumeFn, Function *DestroyFn, Function *CleanupFn) { + assert(Shape.ABI == coro::ABI::Switch); + IRBuilder<> Builder(Shape.FramePtr->getNextNode()); - auto *ResumeAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::ResumeField, + auto *ResumeAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Resume, "resume.addr"); Builder.CreateStore(ResumeFn, ResumeAddr); Value *DestroyOrCleanupFn = DestroyFn; - CoroIdInst *CoroId = Shape.CoroBegin->getId(); + CoroIdInst *CoroId = Shape.getSwitchCoroId(); if (CoroAllocInst *CA = CoroId->getCoroAlloc()) { // If there is a CoroAlloc and it returns false (meaning we elide the // allocation, use CleanupFn instead of DestroyFn). 
DestroyOrCleanupFn = Builder.CreateSelect(CA, DestroyFn, CleanupFn); } - auto *DestroyAddr = Builder.CreateConstInBoundsGEP2_32( - Shape.FrameTy, Shape.FramePtr, 0, coro::Shape::DestroyField, + auto *DestroyAddr = Builder.CreateStructGEP( + Shape.FrameTy, Shape.FramePtr, coro::Shape::SwitchFieldIndex::Destroy, "destroy.addr"); Builder.CreateStore(DestroyOrCleanupFn, DestroyAddr); } static void postSplitCleanup(Function &F) { removeUnreachableBlocks(F); + + // For now, we do a mandatory verification step because we don't + // entirely trust this pass. Note that we don't want to add a verifier + // pass to FPM below because it will also verify all the global data. + verifyFunction(F); + legacy::FunctionPassManager FPM(F.getParent()); - FPM.add(createVerifierPass()); FPM.add(createSCCPPass()); FPM.add(createCFGSimplificationPass()); FPM.add(createEarlyCSEPass()); @@ -457,17 +908,29 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock, // values and select the correct case successor when possible. static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) { DenseMap<Value *, Value *> ResolvedValues; + BasicBlock *UnconditionalSucc = nullptr; Instruction *I = InitialInst; while (I->isTerminator()) { if (isa<ReturnInst>(I)) { - if (I != InitialInst) + if (I != InitialInst) { + // If InitialInst is an unconditional branch, + // remove PHI values that come from basic block of InitialInst + if (UnconditionalSucc) + for (PHINode &PN : UnconditionalSucc->phis()) { + int idx = PN.getBasicBlockIndex(InitialInst->getParent()); + if (idx != -1) + PN.removeIncomingValue(idx); + } ReplaceInstWithInst(InitialInst, I->clone()); + } return true; } if (auto *BR = dyn_cast<BranchInst>(I)) { if (BR->isUnconditional()) { BasicBlock *BB = BR->getSuccessor(0); + if (I == InitialInst) + UnconditionalSucc = BB; scanPHIsAndUpdateValueMap(I, BB, ResolvedValues); I = BB->getFirstNonPHIOrDbgOrLifetime(); continue; @@ -520,21 +983,34 @@ static void addMustTailToCoroResumes(Function &F) { // Coroutine has no suspend points. Remove heap allocation for the coroutine // frame if possible. -static void handleNoSuspendCoroutine(CoroBeginInst *CoroBegin, Type *FrameTy) { +static void handleNoSuspendCoroutine(coro::Shape &Shape) { + auto *CoroBegin = Shape.CoroBegin; auto *CoroId = CoroBegin->getId(); auto *AllocInst = CoroId->getCoroAlloc(); - coro::replaceCoroFree(CoroId, /*Elide=*/AllocInst != nullptr); - if (AllocInst) { - IRBuilder<> Builder(AllocInst); - // FIXME: Need to handle overaligned members. - auto *Frame = Builder.CreateAlloca(FrameTy); - auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); - AllocInst->replaceAllUsesWith(Builder.getFalse()); - AllocInst->eraseFromParent(); - CoroBegin->replaceAllUsesWith(VFrame); - } else { - CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + switch (Shape.ABI) { + case coro::ABI::Switch: { + auto SwitchId = cast<CoroIdInst>(CoroId); + coro::replaceCoroFree(SwitchId, /*Elide=*/AllocInst != nullptr); + if (AllocInst) { + IRBuilder<> Builder(AllocInst); + // FIXME: Need to handle overaligned members. 
+ auto *Frame = Builder.CreateAlloca(Shape.FrameTy); + auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy()); + AllocInst->replaceAllUsesWith(Builder.getFalse()); + AllocInst->eraseFromParent(); + CoroBegin->replaceAllUsesWith(VFrame); + } else { + CoroBegin->replaceAllUsesWith(CoroBegin->getMem()); + } + break; } + + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + CoroBegin->replaceAllUsesWith(UndefValue::get(CoroBegin->getType())); + break; + } + CoroBegin->eraseFromParent(); } @@ -670,12 +1146,16 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend, // Remove suspend points that are simplified. static void simplifySuspendPoints(coro::Shape &Shape) { + // Currently, the only simplification we do is switch-lowering-specific. + if (Shape.ABI != coro::ABI::Switch) + return; + auto &S = Shape.CoroSuspends; size_t I = 0, N = S.size(); if (N == 0) return; while (true) { - if (simplifySuspendPoint(S[I], Shape.CoroBegin)) { + if (simplifySuspendPoint(cast<CoroSuspendInst>(S[I]), Shape.CoroBegin)) { if (--N == I) break; std::swap(S[I], S[N]); @@ -687,142 +1167,227 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } -static SmallPtrSet<BasicBlock *, 4> getCoroBeginPredBlocks(CoroBeginInst *CB) { - // Collect all blocks that we need to look for instructions to relocate. - SmallPtrSet<BasicBlock *, 4> RelocBlocks; - SmallVector<BasicBlock *, 4> Work; - Work.push_back(CB->getParent()); +static void splitSwitchCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + assert(Shape.ABI == coro::ABI::Switch); - do { - BasicBlock *Current = Work.pop_back_val(); - for (BasicBlock *BB : predecessors(Current)) - if (RelocBlocks.count(BB) == 0) { - RelocBlocks.insert(BB); - Work.push_back(BB); - } - } while (!Work.empty()); - return RelocBlocks; -} - -static SmallPtrSet<Instruction *, 8> -getNotRelocatableInstructions(CoroBeginInst *CoroBegin, - SmallPtrSetImpl<BasicBlock *> &RelocBlocks) { - SmallPtrSet<Instruction *, 8> DoNotRelocate; - // Collect all instructions that we should not relocate - SmallVector<Instruction *, 8> Work; - - // Start with CoroBegin and terminators of all preceding blocks. - Work.push_back(CoroBegin); - BasicBlock *CoroBeginBB = CoroBegin->getParent(); - for (BasicBlock *BB : RelocBlocks) - if (BB != CoroBeginBB) - Work.push_back(BB->getTerminator()); - - // For every instruction in the Work list, place its operands in DoNotRelocate - // set. - do { - Instruction *Current = Work.pop_back_val(); - LLVM_DEBUG(dbgs() << "CoroSplit: Will not relocate: " << *Current << "\n"); - DoNotRelocate.insert(Current); - for (Value *U : Current->operands()) { - auto *I = dyn_cast<Instruction>(U); - if (!I) - continue; + createResumeEntryBlock(F, Shape); + auto ResumeClone = createClone(F, ".resume", Shape, + CoroCloner::Kind::SwitchResume); + auto DestroyClone = createClone(F, ".destroy", Shape, + CoroCloner::Kind::SwitchUnwind); + auto CleanupClone = createClone(F, ".cleanup", Shape, + CoroCloner::Kind::SwitchCleanup); - if (auto *A = dyn_cast<AllocaInst>(I)) { - // Stores to alloca instructions that occur before the coroutine frame - // is allocated should not be moved; the stored values may be used by - // the coroutine frame allocator. The operands to those stores must also - // remain in place. 
- for (const auto &User : A->users()) - if (auto *SI = dyn_cast<llvm::StoreInst>(User)) - if (RelocBlocks.count(SI->getParent()) != 0 && - DoNotRelocate.count(SI) == 0) { - Work.push_back(SI); - DoNotRelocate.insert(SI); - } - continue; - } + postSplitCleanup(*ResumeClone); + postSplitCleanup(*DestroyClone); + postSplitCleanup(*CleanupClone); + + addMustTailToCoroResumes(*ResumeClone); + + // Store addresses resume/destroy/cleanup functions in the coroutine frame. + updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); + + assert(Clones.empty()); + Clones.push_back(ResumeClone); + Clones.push_back(DestroyClone); + Clones.push_back(CleanupClone); + + // Create a constant array referring to resume/destroy/clone functions pointed + // by the last argument of @llvm.coro.info, so that CoroElide pass can + // determined correct function to call. + setCoroInfo(F, Shape, Clones); +} + +static void splitRetconCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + assert(Shape.ABI == coro::ABI::Retcon || + Shape.ABI == coro::ABI::RetconOnce); + assert(Clones.empty()); + + // Reset various things that the optimizer might have decided it + // "knows" about the coroutine function due to not seeing a return. + F.removeFnAttr(Attribute::NoReturn); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + F.removeAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + + // Allocate the frame. + auto *Id = cast<AnyCoroIdRetconInst>(Shape.CoroBegin->getId()); + Value *RawFramePtr; + if (Shape.RetconLowering.IsFrameInlineInStorage) { + RawFramePtr = Id->getStorage(); + } else { + IRBuilder<> Builder(Id); + + // Determine the size of the frame. + const DataLayout &DL = F.getParent()->getDataLayout(); + auto Size = DL.getTypeAllocSize(Shape.FrameTy); + + // Allocate. We don't need to update the call graph node because we're + // going to recompute it from scratch after splitting. + RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr); + RawFramePtr = + Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType()); + + // Stash the allocated frame pointer in the continuation storage. + auto Dest = Builder.CreateBitCast(Id->getStorage(), + RawFramePtr->getType()->getPointerTo()); + Builder.CreateStore(RawFramePtr, Dest); + } - if (DoNotRelocate.count(I) == 0) { - Work.push_back(I); - DoNotRelocate.insert(I); + // Map all uses of llvm.coro.begin to the allocated frame pointer. + { + // Make sure we don't invalidate Shape.FramePtr. + TrackingVH<Instruction> Handle(Shape.FramePtr); + Shape.CoroBegin->replaceAllUsesWith(RawFramePtr); + Shape.FramePtr = Handle.getValPtr(); + } + + // Create a unique return block. + BasicBlock *ReturnBB = nullptr; + SmallVector<PHINode *, 4> ReturnPHIs; + + // Create all the functions in order after the main function. + auto NextF = std::next(F.getIterator()); + + // Create a continuation function for each of the suspend points. + Clones.reserve(Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = cast<CoroSuspendRetconInst>(Shape.CoroSuspends[i]); + + // Create the clone declaration. + auto Continuation = + createCloneDeclaration(F, Shape, ".resume." + Twine(i), NextF); + Clones.push_back(Continuation); + + // Insert a branch to the unified return block immediately before + // the suspend point. 
+ auto SuspendBB = Suspend->getParent(); + auto NewSuspendBB = SuspendBB->splitBasicBlock(Suspend); + auto Branch = cast<BranchInst>(SuspendBB->getTerminator()); + + // Create the unified return block. + if (!ReturnBB) { + // Place it before the first suspend. + ReturnBB = BasicBlock::Create(F.getContext(), "coro.return", &F, + NewSuspendBB); + Shape.RetconLowering.ReturnBlock = ReturnBB; + + IRBuilder<> Builder(ReturnBB); + + // Create PHIs for all the return values. + assert(ReturnPHIs.empty()); + + // First, the continuation. + ReturnPHIs.push_back(Builder.CreatePHI(Continuation->getType(), + Shape.CoroSuspends.size())); + + // Next, all the directly-yielded values. + for (auto ResultTy : Shape.getRetconResultTypes()) + ReturnPHIs.push_back(Builder.CreatePHI(ResultTy, + Shape.CoroSuspends.size())); + + // Build the return value. + auto RetTy = F.getReturnType(); + + // Cast the continuation value if necessary. + // We can't rely on the types matching up because that type would + // have to be infinite. + auto CastedContinuationTy = + (ReturnPHIs.size() == 1 ? RetTy : RetTy->getStructElementType(0)); + auto *CastedContinuation = + Builder.CreateBitCast(ReturnPHIs[0], CastedContinuationTy); + + Value *RetV; + if (ReturnPHIs.size() == 1) { + RetV = CastedContinuation; + } else { + RetV = UndefValue::get(RetTy); + RetV = Builder.CreateInsertValue(RetV, CastedContinuation, 0); + for (size_t I = 1, E = ReturnPHIs.size(); I != E; ++I) + RetV = Builder.CreateInsertValue(RetV, ReturnPHIs[I], I); } + + Builder.CreateRet(RetV); } - } while (!Work.empty()); - return DoNotRelocate; -} -static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { - // Analyze which non-alloca instructions are needed for allocation and - // relocate the rest to after coro.begin. We need to do it, since some of the - // targets of those instructions may be placed into coroutine frame memory - // for which becomes available after coro.begin intrinsic. + // Branch to the return block. + Branch->setSuccessor(0, ReturnBB); + ReturnPHIs[0]->addIncoming(Continuation, SuspendBB); + size_t NextPHIIndex = 1; + for (auto &VUse : Suspend->value_operands()) + ReturnPHIs[NextPHIIndex++]->addIncoming(&*VUse, SuspendBB); + assert(NextPHIIndex == ReturnPHIs.size()); + } - auto BlockSet = getCoroBeginPredBlocks(CoroBegin); - auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + assert(Clones.size() == Shape.CoroSuspends.size()); + for (size_t i = 0, e = Shape.CoroSuspends.size(); i != e; ++i) { + auto Suspend = Shape.CoroSuspends[i]; + auto Clone = Clones[i]; - Instruction *InsertPt = CoroBegin->getNextNode(); - BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. - for (auto B = BB.begin(), E = BB.end(); B != E;) { - Instruction &I = *B++; - if (isa<AllocaInst>(&I)) - continue; - if (&I == CoroBegin) - break; - if (DoNotRelocateSet.count(&I)) - continue; - I.moveBefore(InsertPt); + CoroCloner(F, "resume." 
+ Twine(i), Shape, Clone, Suspend).create(); + } +} + +namespace { + class PrettyStackTraceFunction : public PrettyStackTraceEntry { + Function &F; + public: + PrettyStackTraceFunction(Function &F) : F(F) {} + void print(raw_ostream &OS) const override { + OS << "While splitting coroutine "; + F.printAsOperand(OS, /*print type*/ false, F.getParent()); + OS << "\n"; + } + }; +} + +static void splitCoroutine(Function &F, coro::Shape &Shape, + SmallVectorImpl<Function *> &Clones) { + switch (Shape.ABI) { + case coro::ABI::Switch: + return splitSwitchCoroutine(F, Shape, Clones); + case coro::ABI::Retcon: + case coro::ABI::RetconOnce: + return splitRetconCoroutine(F, Shape, Clones); } + llvm_unreachable("bad ABI kind"); } static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { - EliminateUnreachableBlocks(F); + PrettyStackTraceFunction prettyStackTrace(F); + + // The suspend-crossing algorithm in buildCoroutineFrame get tripped + // up by uses in unreachable blocks, so remove them as a first pass. + removeUnreachableBlocks(F); coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); - relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); + SmallVector<Function*, 4> Clones; + // If there are no suspend points, no split required, just remove // the allocation and deallocation blocks, they are not needed. if (Shape.CoroSuspends.empty()) { - handleNoSuspendCoroutine(Shape.CoroBegin, Shape.FrameTy); - removeCoroEnds(Shape); - postSplitCleanup(F); - coro::updateCallGraph(F, {}, CG, SCC); - return; + handleNoSuspendCoroutine(Shape); + } else { + splitCoroutine(F, Shape, Clones); } - auto *ResumeEntry = createResumeEntryBlock(F, Shape); - auto ResumeClone = createClone(F, ".resume", Shape, ResumeEntry, 0); - auto DestroyClone = createClone(F, ".destroy", Shape, ResumeEntry, 1); - auto CleanupClone = createClone(F, ".cleanup", Shape, ResumeEntry, 2); - - // We no longer need coro.end in F. - removeCoroEnds(Shape); + // Replace all the swifterror operations in the original function. + // This invalidates SwiftErrorOps in the Shape. + replaceSwiftErrorOps(F, Shape, nullptr); + removeCoroEnds(Shape, &CG); postSplitCleanup(F); - postSplitCleanup(*ResumeClone); - postSplitCleanup(*DestroyClone); - postSplitCleanup(*CleanupClone); - - addMustTailToCoroResumes(*ResumeClone); - - // Store addresses resume/destroy/cleanup functions in the coroutine frame. - updateCoroFrame(Shape, ResumeClone, DestroyClone, CleanupClone); - - // Create a constant array referring to resume/destroy/clone functions pointed - // by the last argument of @llvm.coro.info, so that CoroElide pass can - // determined correct function to call. - setCoroInfo(F, Shape.CoroBegin, {ResumeClone, DestroyClone, CleanupClone}); // Update call graph and add the functions we created to the SCC. - coro::updateCallGraph(F, {ResumeClone, DestroyClone, CleanupClone}, CG, SCC); + coro::updateCallGraph(F, Clones, CG, SCC); } // When we see the coroutine the first time, we insert an indirect call to a @@ -856,9 +1421,10 @@ static void prepareForSplit(Function &F, CallGraph &CG) { CG[&F]->addCalledFunction(IndirectCall, CG.getCallsExternalNode()); } -// Make sure that there is a devirtualization trigger function that CoroSplit -// pass uses the force restart CGSCC pipeline. If devirt trigger function is not -// found, we will create one and add it to the current SCC. 
+// Make sure that there is a devirtualization trigger function that the
+// coro-split pass uses to force a restart of the CGSCC pipeline. If the devirt
+// trigger function is not found, we will create one and add it to the current
+// SCC.
 static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
   Module &M = CG.getModule();
   if (M.getFunction(CORO_DEVIRT_TRIGGER_FN))
@@ -881,17 +1447,91 @@ static void createDevirtTriggerFunc(CallGraph &CG, CallGraphSCC &SCC) {
   SCC.initialize(Nodes);
 }
+/// Replace a call to llvm.coro.prepare.retcon.
+static void replacePrepare(CallInst *Prepare, CallGraph &CG) {
+  auto CastFn = Prepare->getArgOperand(0); // as an i8*
+  auto Fn = CastFn->stripPointerCasts(); // as its original type
+
+  // Find call graph nodes for the preparation.
+  CallGraphNode *PrepareUserNode = nullptr, *FnNode = nullptr;
+  if (auto ConcreteFn = dyn_cast<Function>(Fn)) {
+    PrepareUserNode = CG[Prepare->getFunction()];
+    FnNode = CG[ConcreteFn];
+  }
+
+  // Attempt to peephole this pattern:
+  //   %0 = bitcast [[TYPE]] @some_function to i8*
+  //   %1 = call @llvm.coro.prepare.retcon(i8* %0)
+  //   %2 = bitcast %1 to [[TYPE]]
+  // ==>
+  //   %2 = @some_function
+  for (auto UI = Prepare->use_begin(), UE = Prepare->use_end();
+       UI != UE; ) {
+    // Look for bitcasts back to the original function type.
+    auto *Cast = dyn_cast<BitCastInst>((UI++)->getUser());
+    if (!Cast || Cast->getType() != Fn->getType()) continue;
+
+    // Check whether the replacement will introduce new direct calls.
+    // If so, we'll need to update the call graph.
+    if (PrepareUserNode) {
+      for (auto &Use : Cast->uses()) {
+        if (auto *CB = dyn_cast<CallBase>(Use.getUser())) {
+          if (!CB->isCallee(&Use))
+            continue;
+          PrepareUserNode->removeCallEdgeFor(*CB);
+          PrepareUserNode->addCalledFunction(CB, FnNode);
+        }
+      }
+    }
+
+    // Replace and remove the cast.
+    Cast->replaceAllUsesWith(Fn);
+    Cast->eraseFromParent();
+  }
+
+  // Replace any remaining uses with the function as an i8*.
+  // This can never directly be a callee, so we don't need to update CG.
+  Prepare->replaceAllUsesWith(CastFn);
+  Prepare->eraseFromParent();
+
+  // Kill dead bitcasts.
+  while (auto *Cast = dyn_cast<BitCastInst>(CastFn)) {
+    if (!Cast->use_empty()) break;
+    CastFn = Cast->getOperand(0);
+    Cast->eraseFromParent();
+  }
+}
+
+/// Remove calls to llvm.coro.prepare.retcon, a barrier meant to prevent
+/// IPO from operating on calls to a retcon coroutine before it's been
+/// split. This is only safe to do after we've split all retcon
+/// coroutines in the module. We can do this in this pass because
+/// this pass does promise to split all retcon coroutines (as opposed to
+/// switch coroutines, which are lowered in multiple stages).
+static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) {
+  bool Changed = false;
+  for (auto PI = PrepareFn->use_begin(), PE = PrepareFn->use_end();
+       PI != PE; ) {
+    // Intrinsics can only be used in calls.
+ auto *Prepare = cast<CallInst>((PI++)->getUser()); + replacePrepare(Prepare, CG); + Changed = true; + } + + return Changed; +} + //===----------------------------------------------------------------------===// // Top Level Driver //===----------------------------------------------------------------------===// namespace { -struct CoroSplit : public CallGraphSCCPass { +struct CoroSplitLegacy : public CallGraphSCCPass { static char ID; // Pass identification, replacement for typeid - CoroSplit() : CallGraphSCCPass(ID) { - initializeCoroSplitPass(*PassRegistry::getPassRegistry()); + CoroSplitLegacy() : CallGraphSCCPass(ID) { + initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry()); } bool Run = false; @@ -899,7 +1539,9 @@ struct CoroSplit : public CallGraphSCCPass { // A coroutine is identified by the presence of coro.begin intrinsic, if // we don't have any, this pass has nothing to do. bool doInitialization(CallGraph &CG) override { - Run = coro::declaresIntrinsics(CG.getModule(), {"llvm.coro.begin"}); + Run = coro::declaresIntrinsics(CG.getModule(), + {"llvm.coro.begin", + "llvm.coro.prepare.retcon"}); return CallGraphSCCPass::doInitialization(CG); } @@ -907,6 +1549,12 @@ struct CoroSplit : public CallGraphSCCPass { if (!Run) return false; + // Check for uses of llvm.coro.prepare.retcon. + auto PrepareFn = + SCC.getCallGraph().getModule().getFunction("llvm.coro.prepare.retcon"); + if (PrepareFn && PrepareFn->use_empty()) + PrepareFn = nullptr; + // Find coroutines for processing. SmallVector<Function *, 4> Coroutines; for (CallGraphNode *CGN : SCC) @@ -914,12 +1562,17 @@ struct CoroSplit : public CallGraphSCCPass { if (F->hasFnAttribute(CORO_PRESPLIT_ATTR)) Coroutines.push_back(F); - if (Coroutines.empty()) + if (Coroutines.empty() && !PrepareFn) return false; CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); + + if (Coroutines.empty()) + return replaceAllPrepares(PrepareFn, CG); + createDevirtTriggerFunc(CG, SCC); + // Split all the coroutines. 
for (Function *F : Coroutines) { Attribute Attr = F->getFnAttribute(CORO_PRESPLIT_ATTR); StringRef Value = Attr.getValueAsString(); @@ -932,6 +1585,10 @@ struct CoroSplit : public CallGraphSCCPass { F->removeFnAttr(CORO_PRESPLIT_ATTR); splitCoroutine(*F, CG, SCC); } + + if (PrepareFn) + replaceAllPrepares(PrepareFn, CG); + return true; } @@ -944,16 +1601,16 @@ struct CoroSplit : public CallGraphSCCPass { } // end anonymous namespace -char CoroSplit::ID = 0; +char CoroSplitLegacy::ID = 0; INITIALIZE_PASS_BEGIN( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) INITIALIZE_PASS_END( - CoroSplit, "coro-split", + CoroSplitLegacy, "coro-split", "Split coroutine into a set of functions driving its state machine", false, false) -Pass *llvm::createCoroSplitPass() { return new CoroSplit(); } +Pass *llvm::createCoroSplitLegacyPass() { return new CoroSplitLegacy(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp index a581d1d21169..02d11af3303f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -11,14 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Coroutines.h" -#include "llvm-c/Transforms/Coroutines.h" #include "CoroInstr.h" #include "CoroInternal.h" +#include "llvm-c/Transforms/Coroutines.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -31,10 +30,12 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstddef> #include <utility> @@ -42,39 +43,39 @@ using namespace llvm; void llvm::initializeCoroutines(PassRegistry &Registry) { - initializeCoroEarlyPass(Registry); - initializeCoroSplitPass(Registry); - initializeCoroElidePass(Registry); - initializeCoroCleanupPass(Registry); + initializeCoroEarlyLegacyPass(Registry); + initializeCoroSplitLegacyPass(Registry); + initializeCoroElideLegacyPass(Registry); + initializeCoroCleanupLegacyPass(Registry); } static void addCoroutineOpt0Passes(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroSplitPass()); - PM.add(createCoroElidePass()); + PM.add(createCoroSplitLegacyPass()); + PM.add(createCoroElideLegacyPass()); PM.add(createBarrierNoopPass()); - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } static void addCoroutineEarlyPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroEarlyPass()); + PM.add(createCoroEarlyLegacyPass()); } static void addCoroutineScalarOptimizerPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroElidePass()); + PM.add(createCoroElideLegacyPass()); } static void addCoroutineSCCPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { 
- PM.add(createCoroSplitPass()); + PM.add(createCoroSplitLegacyPass()); } static void addCoroutineOptimizerLastPasses(const PassManagerBuilder &Builder, legacy::PassManagerBase &PM) { - PM.add(createCoroCleanupPass()); + PM.add(createCoroCleanupLegacyPass()); } void llvm::addCoroutinePassesToExtensionPoints(PassManagerBuilder &Builder) { @@ -123,12 +124,26 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, static bool isCoroutineIntrinsicName(StringRef Name) { // NOTE: Must be sorted! static const char *const CoroIntrinsics[] = { - "llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.destroy", - "llvm.coro.done", "llvm.coro.end", "llvm.coro.frame", - "llvm.coro.free", "llvm.coro.id", "llvm.coro.noop", - "llvm.coro.param", "llvm.coro.promise", "llvm.coro.resume", - "llvm.coro.save", "llvm.coro.size", "llvm.coro.subfn.addr", + "llvm.coro.alloc", + "llvm.coro.begin", + "llvm.coro.destroy", + "llvm.coro.done", + "llvm.coro.end", + "llvm.coro.frame", + "llvm.coro.free", + "llvm.coro.id", + "llvm.coro.id.retcon", + "llvm.coro.id.retcon.once", + "llvm.coro.noop", + "llvm.coro.param", + "llvm.coro.prepare.retcon", + "llvm.coro.promise", + "llvm.coro.resume", + "llvm.coro.save", + "llvm.coro.size", + "llvm.coro.subfn.addr", "llvm.coro.suspend", + "llvm.coro.suspend.retcon", }; return Intrinsic::lookupLLVMIntrinsicByName(CoroIntrinsics, Name) != -1; } @@ -136,8 +151,8 @@ static bool isCoroutineIntrinsicName(StringRef Name) { // Verifies if a module has named values listed. Also, in debug mode verifies // that names are intrinsic names. -bool coro::declaresIntrinsics(Module &M, - std::initializer_list<StringRef> List) { +bool coro::declaresIntrinsics(const Module &M, + const std::initializer_list<StringRef> List) { for (StringRef Name : List) { assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic"); if (M.getNamedValue(Name)) @@ -217,9 +232,6 @@ static void clear(coro::Shape &Shape) { Shape.FrameTy = nullptr; Shape.FramePtr = nullptr; Shape.AllocaSpillBlock = nullptr; - Shape.ResumeSwitch = nullptr; - Shape.PromiseAlloca = nullptr; - Shape.HasFinalSuspend = false; } static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, @@ -235,6 +247,7 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, // Collect "interesting" coroutine intrinsics. void coro::Shape::buildFrom(Function &F) { + bool HasFinalSuspend = false; size_t FinalSuspendIndex = 0; clear(*this); SmallVector<CoroFrameInst *, 8> CoroFrames; @@ -257,9 +270,15 @@ void coro::Shape::buildFrom(Function &F) { if (II->use_empty()) UnusedCoroSaves.push_back(cast<CoroSaveInst>(II)); break; - case Intrinsic::coro_suspend: - CoroSuspends.push_back(cast<CoroSuspendInst>(II)); - if (CoroSuspends.back()->isFinal()) { + case Intrinsic::coro_suspend_retcon: { + auto Suspend = cast<CoroSuspendRetconInst>(II); + CoroSuspends.push_back(Suspend); + break; + } + case Intrinsic::coro_suspend: { + auto Suspend = cast<CoroSuspendInst>(II); + CoroSuspends.push_back(Suspend); + if (Suspend->isFinal()) { if (HasFinalSuspend) report_fatal_error( "Only one suspend point can be marked as final"); @@ -267,18 +286,23 @@ void coro::Shape::buildFrom(Function &F) { FinalSuspendIndex = CoroSuspends.size() - 1; } break; + } case Intrinsic::coro_begin: { auto CB = cast<CoroBeginInst>(II); - if (CB->getId()->getInfo().isPreSplit()) { - if (CoroBegin) - report_fatal_error( + + // Ignore coro id's that aren't pre-split. 
+ auto Id = dyn_cast<CoroIdInst>(CB->getId()); + if (Id && !Id->getInfo().isPreSplit()) + break; + + if (CoroBegin) + report_fatal_error( "coroutine should have exactly one defining @llvm.coro.begin"); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); - CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); - CB->removeAttribute(AttributeList::FunctionIndex, - Attribute::NoDuplicate); - CoroBegin = CB; - } + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); + CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); + CB->removeAttribute(AttributeList::FunctionIndex, + Attribute::NoDuplicate); + CoroBegin = CB; break; } case Intrinsic::coro_end: @@ -310,7 +334,7 @@ void coro::Shape::buildFrom(Function &F) { // Replace all coro.suspend with undef and remove related coro.saves if // present. - for (CoroSuspendInst *CS : CoroSuspends) { + for (AnyCoroSuspendInst *CS : CoroSuspends) { CS->replaceAllUsesWith(UndefValue::get(CS->getType())); CS->eraseFromParent(); if (auto *CoroSave = CS->getCoroSave()) @@ -324,19 +348,136 @@ void coro::Shape::buildFrom(Function &F) { return; } + auto Id = CoroBegin->getId(); + switch (auto IdIntrinsic = Id->getIntrinsicID()) { + case Intrinsic::coro_id: { + auto SwitchId = cast<CoroIdInst>(Id); + this->ABI = coro::ABI::Switch; + this->SwitchLowering.HasFinalSuspend = HasFinalSuspend; + this->SwitchLowering.ResumeSwitch = nullptr; + this->SwitchLowering.PromiseAlloca = SwitchId->getPromise(); + this->SwitchLowering.ResumeEntryBlock = nullptr; + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast<CoroSuspendInst>(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id must be paired with coro.suspend"); + } + + if (!Suspend->getCoroSave()) + createCoroSave(CoroBegin, Suspend); + } + break; + } + + case Intrinsic::coro_id_retcon: + case Intrinsic::coro_id_retcon_once: { + auto ContinuationId = cast<AnyCoroIdRetconInst>(Id); + ContinuationId->checkWellFormed(); + this->ABI = (IdIntrinsic == Intrinsic::coro_id_retcon + ? coro::ABI::Retcon + : coro::ABI::RetconOnce); + auto Prototype = ContinuationId->getPrototype(); + this->RetconLowering.ResumePrototype = Prototype; + this->RetconLowering.Alloc = ContinuationId->getAllocFunction(); + this->RetconLowering.Dealloc = ContinuationId->getDeallocFunction(); + this->RetconLowering.ReturnBlock = nullptr; + this->RetconLowering.IsFrameInlineInStorage = false; + + // Determine the result value types, and make sure they match up with + // the values passed to the suspends. + auto ResultTys = getRetconResultTypes(); + auto ResumeTys = getRetconResumeTypes(); + + for (auto AnySuspend : CoroSuspends) { + auto Suspend = dyn_cast<CoroSuspendRetconInst>(AnySuspend); + if (!Suspend) { +#ifndef NDEBUG + AnySuspend->dump(); +#endif + report_fatal_error("coro.id.retcon.* must be paired with " + "coro.suspend.retcon"); + } + + // Check that the argument types of the suspend match the results. + auto SI = Suspend->value_begin(), SE = Suspend->value_end(); + auto RI = ResultTys.begin(), RE = ResultTys.end(); + for (; SI != SE && RI != RE; ++SI, ++RI) { + auto SrcTy = (*SI)->getType(); + if (SrcTy != *RI) { + // The optimizer likes to eliminate bitcasts leading into variadic + // calls, but that messes with our invariants. Re-insert the + // bitcast and ignore this type mismatch. 
+ if (CastInst::isBitCastable(SrcTy, *RI)) { + auto BCI = new BitCastInst(*SI, *RI, "", Suspend); + SI->set(BCI); + continue; + } + +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("argument to coro.suspend.retcon does not " + "match corresponding prototype function result"); + } + } + if (SI != SE || RI != RE) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of arguments to coro.suspend.retcon"); + } + + // Check that the result type of the suspend matches the resume types. + Type *SResultTy = Suspend->getType(); + ArrayRef<Type*> SuspendResultTys; + if (SResultTy->isVoidTy()) { + // leave as empty array + } else if (auto SResultStructTy = dyn_cast<StructType>(SResultTy)) { + SuspendResultTys = SResultStructTy->elements(); + } else { + // forms an ArrayRef using SResultTy, be careful + SuspendResultTys = SResultTy; + } + if (SuspendResultTys.size() != ResumeTys.size()) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("wrong number of results from coro.suspend.retcon"); + } + for (size_t I = 0, E = ResumeTys.size(); I != E; ++I) { + if (SuspendResultTys[I] != ResumeTys[I]) { +#ifndef NDEBUG + Suspend->dump(); + Prototype->getFunctionType()->dump(); +#endif + report_fatal_error("result from coro.suspend.retcon does not " + "match corresponding prototype function param"); + } + } + } + break; + } + + default: + llvm_unreachable("coro.begin is not dependent on a coro.id call"); + } + // The coro.free intrinsic is always lowered to the result of coro.begin. for (CoroFrameInst *CF : CoroFrames) { CF->replaceAllUsesWith(CoroBegin); CF->eraseFromParent(); } - // Canonicalize coro.suspend by inserting a coro.save if needed. - for (CoroSuspendInst *CS : CoroSuspends) - if (!CS->getCoroSave()) - createCoroSave(CoroBegin, CS); - // Move final suspend to be the last element in the CoroSuspends vector. - if (HasFinalSuspend && + if (ABI == coro::ABI::Switch && + SwitchLowering.HasFinalSuspend && FinalSuspendIndex != CoroSuspends.size() - 1) std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back()); @@ -345,18 +486,166 @@ void coro::Shape::buildFrom(Function &F) { CoroSave->eraseFromParent(); } +static void propagateCallAttrsFromCallee(CallInst *Call, Function *Callee) { + Call->setCallingConv(Callee->getCallingConv()); + // TODO: attributes? 
+}
+
+static void addCallToCallGraph(CallGraph *CG, CallInst *Call, Function *Callee){
+  if (CG)
+    (*CG)[Call->getFunction()]->addCalledFunction(Call, (*CG)[Callee]);
+}
+
+Value *coro::Shape::emitAlloc(IRBuilder<> &Builder, Value *Size,
+                              CallGraph *CG) const {
+  switch (ABI) {
+  case coro::ABI::Switch:
+    llvm_unreachable("can't allocate memory in coro switch-lowering");
+
+  case coro::ABI::Retcon:
+  case coro::ABI::RetconOnce: {
+    auto Alloc = RetconLowering.Alloc;
+    Size = Builder.CreateIntCast(Size,
+                                 Alloc->getFunctionType()->getParamType(0),
+                                 /*is signed*/ false);
+    auto *Call = Builder.CreateCall(Alloc, Size);
+    propagateCallAttrsFromCallee(Call, Alloc);
+    addCallToCallGraph(CG, Call, Alloc);
+    return Call;
+  }
+  }
+  llvm_unreachable("Unknown coro::ABI enum");
+}
+
+void coro::Shape::emitDealloc(IRBuilder<> &Builder, Value *Ptr,
+                              CallGraph *CG) const {
+  switch (ABI) {
+  case coro::ABI::Switch:
+    llvm_unreachable("can't deallocate memory in coro switch-lowering");
+
+  case coro::ABI::Retcon:
+  case coro::ABI::RetconOnce: {
+    auto Dealloc = RetconLowering.Dealloc;
+    Ptr = Builder.CreateBitCast(Ptr,
+                                Dealloc->getFunctionType()->getParamType(0));
+    auto *Call = Builder.CreateCall(Dealloc, Ptr);
+    propagateCallAttrsFromCallee(Call, Dealloc);
+    addCallToCallGraph(CG, Call, Dealloc);
+    return;
+  }
+  }
+  llvm_unreachable("Unknown coro::ABI enum");
+}
+
+LLVM_ATTRIBUTE_NORETURN
+static void fail(const Instruction *I, const char *Reason, Value *V) {
+#ifndef NDEBUG
+  I->dump();
+  if (V) {
+    errs() << "  Value: ";
+    V->printAsOperand(llvm::errs());
+    errs() << '\n';
+  }
+#endif
+  report_fatal_error(Reason);
+}
+
+/// Check that the given value is a well-formed prototype for the
+/// llvm.coro.id.retcon.* intrinsics.
+static void checkWFRetconPrototype(const AnyCoroIdRetconInst *I, Value *V) {
+  auto F = dyn_cast<Function>(V->stripPointerCasts());
+  if (!F)
+    fail(I, "llvm.coro.id.retcon.* prototype not a Function", V);
+
+  auto FT = F->getFunctionType();
+
+  if (isa<CoroIdRetconInst>(I)) {
+    bool ResultOkay;
+    if (FT->getReturnType()->isPointerTy()) {
+      ResultOkay = true;
+    } else if (auto SRetTy = dyn_cast<StructType>(FT->getReturnType())) {
+      ResultOkay = (!SRetTy->isOpaque() &&
+                    SRetTy->getNumElements() > 0 &&
+                    SRetTy->getElementType(0)->isPointerTy());
+    } else {
+      ResultOkay = false;
+    }
+    if (!ResultOkay)
+      fail(I, "llvm.coro.id.retcon prototype must return pointer as first "
+              "result", F);
+
+    if (FT->getReturnType() !=
+          I->getFunction()->getFunctionType()->getReturnType())
+      fail(I, "llvm.coro.id.retcon prototype return type must be same as "
+              "current function return type", F);
+  } else {
+    // No meaningful validation to do here for llvm.coro.id.retcon.once.
+  }
+
+  if (FT->getNumParams() == 0 || !FT->getParamType(0)->isPointerTy())
+    fail(I, "llvm.coro.id.retcon.* prototype must take pointer as "
+            "its first parameter", F);
+}
+
+/// Check that the given value is a well-formed allocator.
+static void checkWFAlloc(const Instruction *I, Value *V) {
+  auto F = dyn_cast<Function>(V->stripPointerCasts());
+  if (!F)
+    fail(I, "llvm.coro.* allocator not a Function", V);
+
+  auto FT = F->getFunctionType();
+  if (!FT->getReturnType()->isPointerTy())
+    fail(I, "llvm.coro.* allocator must return a pointer", F);
+
+  if (FT->getNumParams() != 1 ||
+      !FT->getParamType(0)->isIntegerTy())
+    fail(I, "llvm.coro.* allocator must take integer as only param", F);
+}
+
+/// Check that the given value is a well-formed deallocator.
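// Illustrative sketch only (the names below are hypothetical, not from LLVM):
// taken together, checkWFRetconPrototype, checkWFAlloc, and the deallocator
// check that follows expect user-supplied functions of roughly this shape for
// llvm.coro.id.retcon.*:
//
//   // Continuation prototype: returns a pointer (or a struct whose first
//   // element is a pointer) and takes the frame pointer as its first param.
//   SomeResult my_continuation_prototype(void *frame /*, yielded values... */);
//
//   // Allocator: a single integer parameter (the size), pointer result.
//   void *my_retcon_alloc(uint64_t size);
//
//   // Deallocator: a single pointer parameter, void result.
//   void my_retcon_dealloc(void *frame);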
+static void checkWFDealloc(const Instruction *I, Value *V) { + auto F = dyn_cast<Function>(V->stripPointerCasts()); + if (!F) + fail(I, "llvm.coro.* deallocator not a Function", V); + + auto FT = F->getFunctionType(); + if (!FT->getReturnType()->isVoidTy()) + fail(I, "llvm.coro.* deallocator must return void", F); + + if (FT->getNumParams() != 1 || + !FT->getParamType(0)->isPointerTy()) + fail(I, "llvm.coro.* deallocator must take pointer as only param", F); +} + +static void checkConstantInt(const Instruction *I, Value *V, + const char *Reason) { + if (!isa<ConstantInt>(V)) { + fail(I, Reason, V); + } +} + +void AnyCoroIdRetconInst::checkWellFormed() const { + checkConstantInt(this, getArgOperand(SizeArg), + "size argument to coro.id.retcon.* must be constant"); + checkConstantInt(this, getArgOperand(AlignArg), + "alignment argument to coro.id.retcon.* must be constant"); + checkWFRetconPrototype(this, getArgOperand(PrototypeArg)); + checkWFAlloc(this, getArgOperand(AllocArg)); + checkWFDealloc(this, getArgOperand(DeallocArg)); +} + void LLVMAddCoroEarlyPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroEarlyPass()); + unwrap(PM)->add(createCoroEarlyLegacyPass()); } void LLVMAddCoroSplitPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroSplitPass()); + unwrap(PM)->add(createCoroSplitLegacyPass()); } void LLVMAddCoroElidePass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroElidePass()); + unwrap(PM)->add(createCoroElideLegacyPass()); } void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createCoroCleanupPass()); + unwrap(PM)->add(createCoroCleanupLegacyPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index c50805692b98..06d1763353f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -22,6 +22,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/Utils/Cloning.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index 95a9f31cced3..cdf8a2eb598e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -70,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -304,7 +305,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // of the previous load. LoadInst *newLoad = IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val"); - newLoad->setAlignment(OrigLoad->getAlignment()); + newLoad->setAlignment(MaybeAlign(OrigLoad->getAlignment())); // Transfer the AA info too. AAMDNodes AAInfo; OrigLoad->getAAMetadata(AAInfo); @@ -386,8 +387,9 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote, // Just add all the struct element types. 
Type *AgTy = cast<PointerType>(I->getType())->getElementType(); - Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, - I->getParamAlignment(), "", InsertPt); + Value *TheAlloca = + new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr, + MaybeAlign(I->getParamAlignment()), "", InsertPt); StructType *STy = cast<StructType>(AgTy); Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr}; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp index 2a52c6b9b4ad..f2995817eaf8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Attributor.cpp @@ -16,20 +16,31 @@ #include "llvm/Transforms/IPO/Attributor.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/LazyValueInfo.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" + #include <cassert> using namespace llvm; @@ -46,19 +57,82 @@ STATISTIC(NumAttributesValidFixpoint, "Number of abstract attributes in a valid fixpoint state"); STATISTIC(NumAttributesManifested, "Number of abstract attributes manifested in IR"); -STATISTIC(NumFnNoUnwind, "Number of functions marked nounwind"); - -STATISTIC(NumFnUniqueReturned, "Number of function with unique return"); -STATISTIC(NumFnKnownReturns, "Number of function with known return values"); -STATISTIC(NumFnArgumentReturned, - "Number of function arguments marked returned"); -STATISTIC(NumFnNoSync, "Number of functions marked nosync"); -STATISTIC(NumFnNoFree, "Number of functions marked nofree"); -STATISTIC(NumFnReturnedNonNull, - "Number of function return values marked nonnull"); -STATISTIC(NumFnArgumentNonNull, "Number of function arguments marked nonnull"); -STATISTIC(NumCSArgumentNonNull, "Number of call site arguments marked nonnull"); -STATISTIC(NumFnWillReturn, "Number of functions marked willreturn"); +STATISTIC(NumAttributesFixedDueToRequiredDependences, + "Number of abstract attributes fixed due to required dependences"); + +// Some helper macros to deal with statistics tracking. +// +// Usage: +// For simple IR attribute tracking overload trackStatistics in the abstract +// attribute and choose the right STATS_DECLTRACK_********* macro, +// e.g.,: +// void trackStatistics() const override { +// STATS_DECLTRACK_ARG_ATTR(returned) +// } +// If there is a single "increment" side one can use the macro +// STATS_DECLTRACK with a custom message. If there are multiple increment +// sides, STATS_DECL and STATS_TRACK can also be used separatly. 
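// Illustrative expansion (a sketch, not part of the macros themselves): with
// the definitions that follow, a use such as
//
//   STATS_DECLTRACK_ARG_ATTR(returned)
//
// expands roughly to
//
//   {
//     STATISTIC(NumIRArguments_returned,
//               "Number of arguments marked 'returned'");
//     ++(NumIRArguments_returned);
//   }
//
// i.e. BUILD_STAT_NAME pastes the position kind and attribute name into the
// statistic variable, and BUILD_STAT_MSG_IR_ATTR builds its description.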
+// +#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \ + ("Number of " #TYPE " marked '" #NAME "'") +#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME +#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG); +#define STATS_DECL(NAME, TYPE, MSG) \ + STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG); +#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE)); +#define STATS_DECLTRACK(NAME, TYPE, MSG) \ + { \ + STATS_DECL(NAME, TYPE, MSG) \ + STATS_TRACK(NAME, TYPE) \ + } +#define STATS_DECLTRACK_ARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME)) +#define STATS_DECLTRACK_CSARG_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSArguments, \ + BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME)) +#define STATS_DECLTRACK_FN_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME)) +#define STATS_DECLTRACK_CS_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME)) +#define STATS_DECLTRACK_FNRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, FunctionReturn, \ + BUILD_STAT_MSG_IR_ATTR(function returns, NAME)) +#define STATS_DECLTRACK_CSRET_ATTR(NAME) \ + STATS_DECLTRACK(NAME, CSReturn, \ + BUILD_STAT_MSG_IR_ATTR(call site returns, NAME)) +#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \ + STATS_DECLTRACK(NAME, Floating, \ + ("Number of floating values known to be '" #NAME "'")) + +// Specialization of the operator<< for abstract attributes subclasses. This +// disambiguates situations where multiple operators are applicable. +namespace llvm { +#define PIPE_OPERATOR(CLASS) \ + raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \ + return OS << static_cast<const AbstractAttribute &>(AA); \ + } + +PIPE_OPERATOR(AAIsDead) +PIPE_OPERATOR(AANoUnwind) +PIPE_OPERATOR(AANoSync) +PIPE_OPERATOR(AANoRecurse) +PIPE_OPERATOR(AAWillReturn) +PIPE_OPERATOR(AANoReturn) +PIPE_OPERATOR(AAReturnedValues) +PIPE_OPERATOR(AANonNull) +PIPE_OPERATOR(AANoAlias) +PIPE_OPERATOR(AADereferenceable) +PIPE_OPERATOR(AAAlign) +PIPE_OPERATOR(AANoCapture) +PIPE_OPERATOR(AAValueSimplify) +PIPE_OPERATOR(AANoFree) +PIPE_OPERATOR(AAHeapToStack) +PIPE_OPERATOR(AAReachability) +PIPE_OPERATOR(AAMemoryBehavior) +PIPE_OPERATOR(AAValueConstantRange) + +#undef PIPE_OPERATOR +} // namespace llvm // TODO: Determine a good default value. 
// @@ -72,18 +146,36 @@ static cl::opt<unsigned> MaxFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32)); +static cl::opt<bool> VerifyMaxFixpointIterations( + "attributor-max-iterations-verify", cl::Hidden, + cl::desc("Verify that max-iterations is a tight bound for a fixpoint"), + cl::init(false)); static cl::opt<bool> DisableAttributor( "attributor-disable", cl::Hidden, cl::desc("Disable the attributor inter-procedural deduction pass."), cl::init(true)); -static cl::opt<bool> VerifyAttributor( - "attributor-verify", cl::Hidden, - cl::desc("Verify the Attributor deduction and " - "manifestation of attributes -- may issue false-positive errors"), +static cl::opt<bool> AnnotateDeclarationCallSites( + "attributor-annotate-decl-cs", cl::Hidden, + cl::desc("Annotate call sites of function declarations."), cl::init(false)); + +static cl::opt<bool> ManifestInternal( + "attributor-manifest-internal", cl::Hidden, + cl::desc("Manifest Attributor internal string attributes."), cl::init(false)); +static cl::opt<unsigned> DepRecInterval( + "attributor-dependence-recompute-interval", cl::Hidden, + cl::desc("Number of iterations until dependences are recomputed."), + cl::init(4)); + +static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion", + cl::init(true), cl::Hidden); + +static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128), + cl::Hidden); + /// Logic operators for the change status enum class. /// ///{ @@ -95,78 +187,98 @@ ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) { } ///} -/// Helper to adjust the statistics. -static void bookkeeping(AbstractAttribute::ManifestPosition MP, - const Attribute &Attr) { - if (!AreStatisticsEnabled()) - return; +Argument *IRPosition::getAssociatedArgument() const { + if (getPositionKind() == IRP_ARGUMENT) + return cast<Argument>(&getAnchorValue()); + + // Not an Argument and no argument number means this is not a call site + // argument, thus we cannot find a callback argument to return. + int ArgNo = getArgNo(); + if (ArgNo < 0) + return nullptr; + + // Use abstract call sites to make the connection between the call site + // values and the ones in callbacks. If a callback was found that makes use + // of the underlying call site operand, we want the corresponding callback + // callee argument and not the direct callee argument. + Optional<Argument *> CBCandidateArg; + SmallVector<const Use *, 4> CBUses; + ImmutableCallSite ICS(&getAnchorValue()); + AbstractCallSite::getCallbackUses(ICS, CBUses); + for (const Use *U : CBUses) { + AbstractCallSite ACS(U); + assert(ACS && ACS.isCallbackCall()); + if (!ACS.getCalledFunction()) + continue; - if (!Attr.isEnumAttribute()) - return; - switch (Attr.getKindAsEnum()) { - case Attribute::NoUnwind: - NumFnNoUnwind++; - return; - case Attribute::Returned: - NumFnArgumentReturned++; - return; - case Attribute::NoSync: - NumFnNoSync++; - break; - case Attribute::NoFree: - NumFnNoFree++; - break; - case Attribute::NonNull: - switch (MP) { - case AbstractAttribute::MP_RETURNED: - NumFnReturnedNonNull++; - break; - case AbstractAttribute::MP_ARGUMENT: - NumFnArgumentNonNull++; - break; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - NumCSArgumentNonNull++; - break; - default: - break; + for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) { + + // Test if the underlying call site operand is argument number u of the + // callback callee. 
+ if (ACS.getCallArgOperandNo(u) != ArgNo) + continue; + + assert(ACS.getCalledFunction()->arg_size() > u && + "ACS mapped into var-args arguments!"); + if (CBCandidateArg.hasValue()) { + CBCandidateArg = nullptr; + break; + } + CBCandidateArg = ACS.getCalledFunction()->getArg(u); } - break; - case Attribute::WillReturn: - NumFnWillReturn++; - break; - default: - return; } + + // If we found a unique callback candidate argument, return it. + if (CBCandidateArg.hasValue() && CBCandidateArg.getValue()) + return CBCandidateArg.getValue(); + + // If no callbacks were found, or none used the underlying call site operand + // exclusively, use the direct callee argument if available. + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + return Callee->getArg(ArgNo); + + return nullptr; } -template <typename StateTy> -using followValueCB_t = std::function<bool(Value *, StateTy &State)>; -template <typename StateTy> -using visitValueCB_t = std::function<void(Value *, StateTy &State)>; +/// For calls (and invokes) we will only replace instruction uses to not disturb +/// the old style call graph. +/// TODO: Remove this once we get rid of the old PM. +static void replaceAllInstructionUsesWith(Value &Old, Value &New) { + if (!isa<CallBase>(Old)) + return Old.replaceAllUsesWith(&New); + SmallVector<Use *, 8> Uses; + for (Use &U : Old.uses()) + if (isa<Instruction>(U.getUser())) + Uses.push_back(&U); + for (Use *U : Uses) + U->set(&New); +} -/// Recursively visit all values that might become \p InitV at some point. This +/// Recursively visit all values that might become \p IRP at some point. This /// will be done by looking through cast instructions, selects, phis, and calls -/// with the "returned" attribute. The callback \p FollowValueCB is asked before -/// a potential origin value is looked at. If no \p FollowValueCB is passed, a -/// default one is used that will make sure we visit every value only once. Once -/// we cannot look through the value any further, the callback \p VisitValueCB -/// is invoked and passed the current value and the \p State. To limit how much -/// effort is invested, we will never visit more than \p MaxValues values. -template <typename StateTy> +/// with the "returned" attribute. Once we cannot look through the value any +/// further, the callback \p VisitValueCB is invoked and passed the current +/// value, the \p State, and a flag to indicate if we stripped anything. To +/// limit how much effort is invested, we will never visit more values than +/// specified by \p MaxValues. 
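// Minimal usage sketch (illustrative only; the concrete AA type and the state
// accumulation are hypothetical): a caller of genericValueTraversal, declared
// just below, passes a state object plus a callback that folds each leaf value
// into it, e.g.
//
//   BooleanState S;
//   auto VisitValueCB = [](Value &V, BooleanState &S, bool Stripped) -> bool {
//     // ... fold what is known about the leaf value V into S ...
//     return S.isValidState(); // returning false aborts the whole traversal
//   };
//   genericValueTraversal<AASomething, BooleanState>(
//       A, getIRPosition(), *this, S, VisitValueCB);
//
// The querying attribute is only needed so the traversal can consult liveness
// (AAIsDead) and record an optional dependence on it.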
+template <typename AAType, typename StateTy> static bool genericValueTraversal( - Value *InitV, StateTy &State, visitValueCB_t<StateTy> &VisitValueCB, - followValueCB_t<StateTy> *FollowValueCB = nullptr, int MaxValues = 8) { - + Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State, + const function_ref<bool(Value &, StateTy &, bool)> &VisitValueCB, + int MaxValues = 8) { + + const AAIsDead *LivenessAA = nullptr; + if (IRP.getAnchorScope()) + LivenessAA = &A.getAAFor<AAIsDead>( + QueryingAA, IRPosition::function(*IRP.getAnchorScope()), + /* TrackDependence */ false); + bool AnyDead = false; + + // TODO: Use Positions here to allow context sensitivity in VisitValueCB SmallPtrSet<Value *, 16> Visited; - followValueCB_t<bool> DefaultFollowValueCB = [&](Value *Val, bool &) { - return Visited.insert(Val).second; - }; - - if (!FollowValueCB) - FollowValueCB = &DefaultFollowValueCB; - SmallVector<Value *, 16> Worklist; - Worklist.push_back(InitV); + Worklist.push_back(&IRP.getAssociatedValue()); int Iteration = 0; do { @@ -174,7 +286,7 @@ static bool genericValueTraversal( // Check if we should process the current value. To prevent endless // recursion keep a record of the values we followed! - if (!(*FollowValueCB)(V, State)) + if (!Visited.insert(V).second) continue; // Make sure we limit the compile time for complex expressions. @@ -183,23 +295,23 @@ static bool genericValueTraversal( // Explicitly look through calls with a "returned" attribute if we do // not have a pointer as stripPointerCasts only works on them. + Value *NewV = nullptr; if (V->getType()->isPointerTy()) { - V = V->stripPointerCasts(); + NewV = V->stripPointerCasts(); } else { CallSite CS(V); if (CS && CS.getCalledFunction()) { - Value *NewV = nullptr; for (Argument &Arg : CS.getCalledFunction()->args()) if (Arg.hasReturnedAttr()) { NewV = CS.getArgOperand(Arg.getArgNo()); break; } - if (NewV) { - Worklist.push_back(NewV); - continue; - } } } + if (NewV && NewV != V) { + Worklist.push_back(NewV); + continue; + } // Look through select instructions, visit both potential values. if (auto *SI = dyn_cast<SelectInst>(V)) { @@ -208,35 +320,34 @@ static bool genericValueTraversal( continue; } - // Look through phi nodes, visit all operands. + // Look through phi nodes, visit all live operands. if (auto *PHI = dyn_cast<PHINode>(V)) { - Worklist.append(PHI->op_begin(), PHI->op_end()); + assert(LivenessAA && + "Expected liveness in the presence of instructions!"); + for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) { + const BasicBlock *IncomingBB = PHI->getIncomingBlock(u); + if (LivenessAA->isAssumedDead(IncomingBB->getTerminator())) { + AnyDead = true; + continue; + } + Worklist.push_back(PHI->getIncomingValue(u)); + } continue; } // Once a leaf is reached we inform the user through the callback. - VisitValueCB(V, State); + if (!VisitValueCB(*V, State, Iteration > 1)) + return false; } while (!Worklist.empty()); + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + // All values have been visited. return true; } -/// Helper to identify the correct offset into an attribute list. 
-static unsigned getAttrIndex(AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - switch (MP) { - case AbstractAttribute::MP_ARGUMENT: - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: - return ArgNo + AttributeList::FirstArgIndex; - case AbstractAttribute::MP_FUNCTION: - return AttributeList::FunctionIndex; - case AbstractAttribute::MP_RETURNED: - return AttributeList::ReturnIndex; - } - llvm_unreachable("Unknown manifest position!"); -} - /// Return true if \p New is equal or worse than \p Old. static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { if (!Old.isIntAttribute()) @@ -247,12 +358,9 @@ static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) { /// Return true if the information provided by \p Attr was added to the /// attribute list \p Attrs. This is only the case if it was not already present -/// in \p Attrs at the position describe by \p MP and \p ArgNo. +/// in \p Attrs at the position describe by \p PK and \p AttrIdx. static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, - AttributeList &Attrs, - AbstractAttribute::ManifestPosition MP, - unsigned ArgNo = 0) { - unsigned AttrIdx = getAttrIndex(MP, ArgNo); + AttributeList &Attrs, int AttrIdx) { if (Attr.isEnumAttribute()) { Attribute::AttrKind Kind = Attr.getKindAsEnum(); @@ -270,10 +378,32 @@ static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr, Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); return true; } + if (Attr.isIntAttribute()) { + Attribute::AttrKind Kind = Attr.getKindAsEnum(); + if (Attrs.hasAttribute(AttrIdx, Kind)) + if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind))) + return false; + Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind); + Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr); + return true; + } llvm_unreachable("Expected enum or string attribute!"); } +static const Value * +getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, + const DataLayout &DL, + bool AllowNonInbounds = false) { + const Value *Ptr = + Attributor::getPointerOperand(I, /* AllowVolatile */ false); + if (!Ptr) + return nullptr; + + return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, + AllowNonInbounds); +} + ChangeStatus AbstractAttribute::update(Attributor &A) { ChangeStatus HasChanged = ChangeStatus::UNCHANGED; if (getState().isAtFixpoint()) @@ -289,143 +419,515 @@ ChangeStatus AbstractAttribute::update(Attributor &A) { return HasChanged; } -ChangeStatus AbstractAttribute::manifest(Attributor &A) { - assert(getState().isValidState() && - "Attempted to manifest an invalid state!"); - assert(getAssociatedValue() && - "Attempted to manifest an attribute without associated value!"); - - ChangeStatus HasChanged = ChangeStatus::UNCHANGED; - SmallVector<Attribute, 4> DeducedAttrs; - getDeducedAttributes(DeducedAttrs); - - Function &ScopeFn = getAnchorScope(); - LLVMContext &Ctx = ScopeFn.getContext(); - ManifestPosition MP = getManifestPosition(); - - AttributeList Attrs; - SmallVector<unsigned, 4> ArgNos; +ChangeStatus +IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP, + const ArrayRef<Attribute> &DeducedAttrs) { + Function *ScopeFn = IRP.getAssociatedFunction(); + IRPosition::Kind PK = IRP.getPositionKind(); // In the following some generic code that will manifest attributes in // DeducedAttrs if they improve the current IR. Due to the different // annotation positions we use the underlying AttributeList interface. - // Note that MP_CALL_SITE_ARGUMENT can annotate multiple locations. 
- switch (MP) { - case MP_ARGUMENT: - ArgNos.push_back(cast<Argument>(getAssociatedValue())->getArgNo()); - Attrs = ScopeFn.getAttributes(); + AttributeList Attrs; + switch (PK) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + return ChangeStatus::UNCHANGED; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + Attrs = ScopeFn->getAttributes(); break; - case MP_FUNCTION: - case MP_RETURNED: - ArgNos.push_back(0); - Attrs = ScopeFn.getAttributes(); + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + Attrs = ImmutableCallSite(&IRP.getAnchorValue()).getAttributes(); break; - case MP_CALL_SITE_ARGUMENT: { - CallSite CS(&getAnchoredValue()); - for (unsigned u = 0, e = CS.getNumArgOperands(); u != e; u++) - if (CS.getArgOperand(u) == getAssociatedValue()) - ArgNos.push_back(u); - Attrs = CS.getAttributes(); - } } + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + LLVMContext &Ctx = IRP.getAnchorValue().getContext(); for (const Attribute &Attr : DeducedAttrs) { - for (unsigned ArgNo : ArgNos) { - if (!addIfNotExistent(Ctx, Attr, Attrs, MP, ArgNo)) - continue; + if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx())) + continue; - HasChanged = ChangeStatus::CHANGED; - bookkeeping(MP, Attr); - } + HasChanged = ChangeStatus::CHANGED; } if (HasChanged == ChangeStatus::UNCHANGED) return HasChanged; - switch (MP) { - case MP_ARGUMENT: - case MP_FUNCTION: - case MP_RETURNED: - ScopeFn.setAttributes(Attrs); + switch (PK) { + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_FUNCTION: + case IRPosition::IRP_RETURNED: + ScopeFn->setAttributes(Attrs); + break; + case IRPosition::IRP_CALL_SITE: + case IRPosition::IRP_CALL_SITE_RETURNED: + case IRPosition::IRP_CALL_SITE_ARGUMENT: + CallSite(&IRP.getAnchorValue()).setAttributes(Attrs); + break; + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: break; - case MP_CALL_SITE_ARGUMENT: - CallSite(&getAnchoredValue()).setAttributes(Attrs); } return HasChanged; } -Function &AbstractAttribute::getAnchorScope() { - Value &V = getAnchoredValue(); - if (isa<Function>(V)) - return cast<Function>(V); - if (isa<Argument>(V)) - return *cast<Argument>(V).getParent(); - if (isa<Instruction>(V)) - return *cast<Instruction>(V).getFunction(); - llvm_unreachable("No scope for anchored value found!"); +const IRPosition IRPosition::EmptyKey(255); +const IRPosition IRPosition::TombstoneKey(256); + +SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { + IRPositions.emplace_back(IRP); + + ImmutableCallSite ICS(&IRP.getAnchorValue()); + switch (IRP.getPositionKind()) { + case IRPosition::IRP_INVALID: + case IRPosition::IRP_FLOAT: + case IRPosition::IRP_FUNCTION: + return; + case IRPosition::IRP_ARGUMENT: + case IRPosition::IRP_RETURNED: + IRPositions.emplace_back( + IRPosition::function(*IRP.getAssociatedFunction())); + return; + case IRPosition::IRP_CALL_SITE: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) + if (const Function *Callee = ICS.getCalledFunction()) + IRPositions.emplace_back(IRPosition::function(*Callee)); + return; + case IRPosition::IRP_CALL_SITE_RETURNED: + assert(ICS && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. 
+ if (!ICS.hasOperandBundles()) { + if (const Function *Callee = ICS.getCalledFunction()) { + IRPositions.emplace_back(IRPosition::returned(*Callee)); + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + } + IRPositions.emplace_back( + IRPosition::callsite_function(cast<CallBase>(*ICS.getInstruction()))); + return; + case IRPosition::IRP_CALL_SITE_ARGUMENT: { + int ArgNo = IRP.getArgNo(); + assert(ICS && ArgNo >= 0 && "Expected call site!"); + // TODO: We need to look at the operand bundles similar to the redirection + // in CallBase. + if (!ICS.hasOperandBundles()) { + const Function *Callee = ICS.getCalledFunction(); + if (Callee && Callee->arg_size() > unsigned(ArgNo)) + IRPositions.emplace_back(IRPosition::argument(*Callee->getArg(ArgNo))); + if (Callee) + IRPositions.emplace_back(IRPosition::function(*Callee)); + } + IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue())); + return; + } + } } -const Function &AbstractAttribute::getAnchorScope() const { - return const_cast<AbstractAttribute *>(this)->getAnchorScope(); +bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { + for (Attribute::AttrKind AK : AKs) + if (EquivIRP.getAttr(AK).getKindAsEnum() == AK) + return true; + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. + if (IgnoreSubsumingPositions) + break; + } + return false; } -/// -----------------------NoUnwind Function Attribute-------------------------- +void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs, + SmallVectorImpl<Attribute> &Attrs, + bool IgnoreSubsumingPositions) const { + for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { + for (Attribute::AttrKind AK : AKs) { + const Attribute &Attr = EquivIRP.getAttr(AK); + if (Attr.getKindAsEnum() == AK) + Attrs.push_back(Attr); + } + // The first position returned by the SubsumingPositionIterator is + // always the position itself. If we ignore subsuming positions we + // are done after the first iteration. 
+ if (IgnoreSubsumingPositions) + break; + } +} -struct AANoUnwindFunction : AANoUnwind, BooleanState { +void IRPosition::verify() { + switch (KindOrArgNo) { + default: + assert(KindOrArgNo >= 0 && "Expected argument or call site argument!"); + assert((isa<CallBase>(AnchorVal) || isa<Argument>(AnchorVal)) && + "Expected call base or argument for positive attribute index!"); + if (isa<Argument>(AnchorVal)) { + assert(cast<Argument>(AnchorVal)->getArgNo() == unsigned(getArgNo()) && + "Argument number mismatch!"); + assert(cast<Argument>(AnchorVal) == &getAssociatedValue() && + "Associated value mismatch!"); + } else { + assert(cast<CallBase>(*AnchorVal).arg_size() > unsigned(getArgNo()) && + "Call site argument number mismatch!"); + assert(cast<CallBase>(*AnchorVal).getArgOperand(getArgNo()) == + &getAssociatedValue() && + "Associated value mismatch!"); + } + break; + case IRP_INVALID: + assert(!AnchorVal && "Expected no value for an invalid position!"); + break; + case IRP_FLOAT: + assert((!isa<CallBase>(&getAssociatedValue()) && + !isa<Argument>(&getAssociatedValue())) && + "Expected specialized kind for call base and argument values!"); + break; + case IRP_RETURNED: + assert(isa<Function>(AnchorVal) && + "Expected function for a 'returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE_RETURNED: + assert((isa<CallBase>(AnchorVal)) && + "Expected call base for 'call site returned' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_CALL_SITE: + assert((isa<CallBase>(AnchorVal)) && + "Expected call base for 'call site function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + case IRP_FUNCTION: + assert(isa<Function>(AnchorVal) && + "Expected function for a 'function' position!"); + assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!"); + break; + } +} - AANoUnwindFunction(Function &F, InformationCache &InfoCache) - : AANoUnwind(F, InfoCache) {} +namespace { +/// Helper function to clamp a state \p S of type \p StateType with the +/// information in \p R and indicate/return if \p S did change (as-in update is +/// required to be run again). +template <typename StateType> +ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) { + auto Assumed = S.getAssumed(); + S ^= R; + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; +} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } +/// Clamp the information known for all returned values of a function +/// (identified by \p QueryingAA) into \p S. +template <typename AAType, typename StateType = typename AAType::StateType> +static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA, + StateType &S) { + LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for " + << QueryingAA << " into " << S << "\n"); + + assert((QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_RETURNED || + QueryingAA.getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED) && + "Can only clamp returned value states for a function returned or call " + "site returned position!"); + + // Use an optional state as there might not be any return values and we want + // to join (IntegerState::operator&) the state of all there are. 
+  Optional<StateType> T;
+
+  // Callback for each possibly returned value.
+  auto CheckReturnValue = [&](Value &RV) -> bool {
+    const IRPosition &RVPos = IRPosition::value(RV);
+    const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
+    LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
+                      << " @ " << RVPos << "\n");
+    const StateType &AAS = static_cast<const StateType &>(AA.getState());
+    if (T.hasValue())
+      *T &= AAS;
+    else
+      T = AAS;
+    LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
+                      << "\n");
+    return T->isValidState();
+  };
-  /// See AbstractAttribute::getManifestPosition().
-  ManifestPosition getManifestPosition() const override { return MP_FUNCTION; }
+  if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
+    S.indicatePessimisticFixpoint();
+  else if (T.hasValue())
+    S ^= *T;
+}
-  const std::string getAsStr() const override {
-    return getAssumed() ? "nounwind" : "may-unwind";
+/// Helper class to compose two generic deductions.
+template <typename AAType, typename Base, typename StateType,
+          template <typename...> class F, template <typename...> class G>
+struct AAComposeTwoGenericDeduction
+    : public F<AAType, G<AAType, Base, StateType>, StateType> {
+  AAComposeTwoGenericDeduction(const IRPosition &IRP)
+      : F<AAType, G<AAType, Base, StateType>, StateType>(IRP) {}
+
+  /// See AbstractAttribute::updateImpl(...).
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus ChangedF =
+        F<AAType, G<AAType, Base, StateType>, StateType>::updateImpl(A);
+    ChangeStatus ChangedG = G<AAType, Base, StateType>::updateImpl(A);
+    return ChangedF | ChangedG;
+  }
+};
+
+/// Helper class for generic deduction: return value -> returned position.
+template <typename AAType, typename Base,
+          typename StateType = typename AAType::StateType>
+struct AAReturnedFromReturnedValues : public Base {
+  AAReturnedFromReturnedValues(const IRPosition &IRP) : Base(IRP) {}
   /// See AbstractAttribute::updateImpl(...).
-  ChangeStatus updateImpl(Attributor &A) override;
+  ChangeStatus updateImpl(Attributor &A) override {
+    StateType S;
+    clampReturnedValueStates<AAType, StateType>(A, *this, S);
+    // TODO: If we know we visited all returned values, thus none are assumed
+    // dead, we can take the known information from the state T.
+    return clampStateAndIndicateChange<StateType>(this->getState(), S);
+  }
+};
+
+/// Clamp the information known at all call sites for a given argument
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
+                                        StateType &S) {
+  LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
+                    << QueryingAA << " into " << S << "\n");
+
+  assert(QueryingAA.getIRPosition().getPositionKind() ==
+             IRPosition::IRP_ARGUMENT &&
+         "Can only clamp call site argument states for an argument position!");
+
+  // Use an optional state as there might not be any return values and we want
+  // to join (IntegerState::operator&) the state of all there are.
+  Optional<StateType> T;
+
+  // The argument number which is also the call site argument number.
+  unsigned ArgNo = QueryingAA.getIRPosition().getArgNo();
+
+  auto CallSiteCheck = [&](AbstractCallSite ACS) {
+    const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+    // Check if a corresponding argument was found or if it is not associated
+    // (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID) + return false; + + const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos); + LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction() + << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n"); + const StateType &AAS = static_cast<const StateType &>(AA.getState()); + if (T.hasValue()) + *T &= AAS; + else + T = AAS; + LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T + << "\n"); + return T->isValidState(); + }; + + if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true)) + S.indicatePessimisticFixpoint(); + else if (T.hasValue()) + S ^= *T; +} - /// See AANoUnwind::isAssumedNoUnwind(). - bool isAssumedNoUnwind() const override { return getAssumed(); } +/// Helper class for generic deduction: call site argument -> argument position. +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AAArgumentFromCallSiteArguments : public Base { + AAArgumentFromCallSiteArguments(const IRPosition &IRP) : Base(IRP) {} - /// See AANoUnwind::isKnownNoUnwind(). - bool isKnownNoUnwind() const override { return getKnown(); } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + StateType S; + clampCallSiteArgumentStates<AAType, StateType>(A, *this, S); + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange<StateType>(this->getState(), S); + } }; -ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// Helper class for generic replication: function returned -> cs returned. +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AACallSiteReturnedFromReturned : public Base { + AACallSiteReturnedFromReturned(const IRPosition &IRP) : Base(IRP) {} - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = { - (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, - (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + assert(this->getIRPosition().getPositionKind() == + IRPosition::IRP_CALL_SITE_RETURNED && + "Can only wrap function returned positions for call site returned " + "positions!"); + auto &S = this->getState(); + + const Function *AssociatedFunction = + this->getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) + return S.indicatePessimisticFixpoint(); + + IRPosition FnPos = IRPosition::returned(*AssociatedFunction); + const AAType &AA = A.getAAFor<AAType>(*this, FnPos); + return clampStateAndIndicateChange( + S, static_cast<const typename AAType::StateType &>(AA.getState())); + } +}; - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - if (!I->mayThrow()) - continue; +/// Helper class for generic deduction using must-be-executed-context +/// Base class is required to have `followUse` method. - auto *NoUnwindAA = A.getAAFor<AANoUnwind>(*this, *I); +/// bool followUse(Attributor &A, const Use *U, const Instruction *I) +/// U - Underlying use. +/// I - The user of the \p U. +/// `followUse` returns true if the value should be tracked transitively. 
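// Illustrative sketch (hypothetical, not prescribed by the interface beyond
// its signature): a Base class plugged into AAFromMustBeExecutedContext below
// would provide something like
//
//   bool followUse(Attributor &A, const Use *U, const Instruction *I) {
//     // Record that *I, which uses the associated value via *U, is known to
//     // execute whenever the associated value is reached, and update the
//     // state accordingly (hypothetical state update).
//     return true; // keep expanding into the uses of I transitively
//   }
//
// Returning false stops the transitive tracking for that particular user.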
- if (!NoUnwindAA || !NoUnwindAA->isAssumedNoUnwind()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +struct AAFromMustBeExecutedContext : public Base { + AAFromMustBeExecutedContext(const IRPosition &IRP) : Base(IRP) {} + + void initialize(Attributor &A) override { + Base::initialize(A); + const IRPosition &IRP = this->getIRPosition(); + Instruction *CtxI = IRP.getCtxI(); + + if (!CtxI) + return; + + for (const Use &U : IRP.getAssociatedValue().uses()) + Uses.insert(&U); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto BeforeState = this->getState(); + auto &S = this->getState(); + Instruction *CtxI = this->getIRPosition().getCtxI(); + if (!CtxI) + return ChangeStatus::UNCHANGED; + + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + + auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI); + for (unsigned u = 0; u < Uses.size(); ++u) { + const Use *U = Uses[u]; + if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) { + bool Found = Explorer.findInContextOf(UserI, EIt, EEnd); + if (Found && Base::followUse(A, U, UserI)) + for (const Use &Us : UserI->uses()) + Uses.insert(&Us); } } + + return BeforeState == S ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED; } - return ChangeStatus::UNCHANGED; -} + +private: + /// Container for (transitive) uses of the associated value. + SetVector<const Use *> Uses; +}; + +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +using AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext = + AAComposeTwoGenericDeduction<AAType, Base, StateType, + AAFromMustBeExecutedContext, + AAArgumentFromCallSiteArguments>; + +template <typename AAType, typename Base, + typename StateType = typename AAType::StateType> +using AACallSiteReturnedFromReturnedAndMustBeExecutedContext = + AAComposeTwoGenericDeduction<AAType, Base, StateType, + AAFromMustBeExecutedContext, + AACallSiteReturnedFromReturned>; + +/// -----------------------NoUnwind Function Attribute-------------------------- + +struct AANoUnwindImpl : AANoUnwind { + AANoUnwindImpl(const IRPosition &IRP) : AANoUnwind(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "nounwind" : "may-unwind"; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto Opcodes = { + (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet, + (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume}; + + auto CheckForNoUnwind = [&](Instruction &I) { + if (!I.mayThrow()) + return true; + + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &NoUnwindAA = + A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(ICS)); + return NoUnwindAA.isAssumedNoUnwind(); + } + return false; + }; + + if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoUnwindFunction final : public AANoUnwindImpl { + AANoUnwindFunction(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) } +}; + +/// NoUnwind attribute deduction for a call sites. 
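+/// For now this simply mirrors the nounwind state of the callee function.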
+struct AANoUnwindCallSite final : AANoUnwindImpl { + AANoUnwindCallSite(const IRPosition &IRP) : AANoUnwindImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoUnwindImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoUnwind::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); } +}; /// --------------------- Function Return Values ------------------------------- @@ -434,68 +936,48 @@ ChangeStatus AANoUnwindFunction::updateImpl(Attributor &A) { /// /// If there is a unique returned value R, the manifest method will: /// - mark R with the "returned" attribute, if R is an argument. -class AAReturnedValuesImpl final : public AAReturnedValues, AbstractState { +class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState { /// Mapping of values potentially returned by the associated function to the /// return instructions that might return them. - DenseMap<Value *, SmallPtrSet<ReturnInst *, 2>> ReturnedValues; + MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues; + + /// Mapping to remember the number of returned values for a call site such + /// that we can avoid updates if nothing changed. + DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA; + + /// Set of unresolved calls returned by the associated function. + SmallSetVector<CallBase *, 4> UnresolvedCalls; /// State flags /// ///{ - bool IsFixed; - bool IsValidState; - bool HasOverdefinedReturnedCalls; + bool IsFixed = false; + bool IsValidState = true; ///} - /// Collect values that could become \p V in the set \p Values, each mapped to - /// \p ReturnInsts. - void collectValuesRecursively( - Attributor &A, Value *V, SmallPtrSetImpl<ReturnInst *> &ReturnInsts, - DenseMap<Value *, SmallPtrSet<ReturnInst *, 2>> &Values) { - - visitValueCB_t<bool> VisitValueCB = [&](Value *Val, bool &) { - assert(!isa<Instruction>(Val) || - &getAnchorScope() == cast<Instruction>(Val)->getFunction()); - Values[Val].insert(ReturnInsts.begin(), ReturnInsts.end()); - }; - - bool UnusedBool; - bool Success = genericValueTraversal(V, UnusedBool, VisitValueCB); - - // If we did abort the above traversal we haven't see all the values. - // Consequently, we cannot know if the information we would derive is - // accurate so we give up early. - if (!Success) - indicatePessimisticFixpoint(); - } - public: - /// See AbstractAttribute::AbstractAttribute(...). - AAReturnedValuesImpl(Function &F, InformationCache &InfoCache) - : AAReturnedValues(F, InfoCache) { - // We do not have an associated argument yet. - AssociatedVal = nullptr; - } + AAReturnedValuesImpl(const IRPosition &IRP) : AAReturnedValues(IRP) {} /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { // Reset the state. 
- AssociatedVal = nullptr; IsFixed = false; IsValidState = true; - HasOverdefinedReturnedCalls = false; ReturnedValues.clear(); - Function &F = cast<Function>(getAnchoredValue()); + Function *F = getAssociatedFunction(); + if (!F) { + indicatePessimisticFixpoint(); + return; + } // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F); // Look through all arguments, if one is marked as returned we are done. - for (Argument &Arg : F.args()) { + for (Argument &Arg : F->args()) { if (Arg.hasReturnedAttr()) { - auto &ReturnInstSet = ReturnedValues[&Arg]; for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) ReturnInstSet.insert(cast<ReturnInst>(RI)); @@ -505,13 +987,8 @@ public: } } - // If no argument was marked as returned we look at all return instructions - // and collect potentially returned values. - for (Instruction *RI : OpcodeInstMap[Instruction::Ret]) { - SmallPtrSet<ReturnInst *, 1> RISet({cast<ReturnInst>(RI)}); - collectValuesRecursively(A, cast<ReturnInst>(RI)->getReturnValue(), RISet, - ReturnedValues); - } + if (!F->hasExactDefinition()) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::manifest(...). @@ -523,25 +1000,35 @@ public: /// See AbstractAttribute::getState(...). const AbstractState &getState() const override { return *this; } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } - /// See AbstractAttribute::updateImpl(Attributor &A). ChangeStatus updateImpl(Attributor &A) override; + llvm::iterator_range<iterator> returned_values() override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + llvm::iterator_range<const_iterator> returned_values() const override { + return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end()); + } + + const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override { + return UnresolvedCalls; + } + /// Return the number of potential return values, -1 if unknown. - size_t getNumReturnValues() const { + size_t getNumReturnValues() const override { return isValidState() ? ReturnedValues.size() : -1; } /// Return an assumed unique return value if a single candidate is found. If /// there cannot be one, return a nullptr. If it is not clear yet, return the /// Optional::NoneType. - Optional<Value *> getAssumedUniqueReturnValue() const; + Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const; - /// See AbstractState::checkForallReturnedValues(...). - bool - checkForallReturnedValues(std::function<bool(Value &)> &Pred) const override; + /// See AbstractState::checkForAllReturnedValues(...). + bool checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred) const override; /// Pretty print the attribute similar to the IR representation. const std::string getAsStr() const override; @@ -553,13 +1040,15 @@ public: bool isValidState() const override { return IsValidState; } /// See AbstractState::indicateOptimisticFixpoint(...). 
- void indicateOptimisticFixpoint() override { + ChangeStatus indicateOptimisticFixpoint() override { IsFixed = true; - IsValidState &= true; + return ChangeStatus::UNCHANGED; } - void indicatePessimisticFixpoint() override { + + ChangeStatus indicatePessimisticFixpoint() override { IsFixed = true; IsValidState = false; + return ChangeStatus::CHANGED; } }; @@ -568,21 +1057,58 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Bookkeeping. assert(isValidState()); - NumFnKnownReturns++; + STATS_DECLTRACK(KnownReturnValues, FunctionReturn, + "Number of function with known return values"); // Check if we have an assumed unique return value that we could manifest. - Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(); + Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A); if (!UniqueRV.hasValue() || !UniqueRV.getValue()) return Changed; // Bookkeeping. - NumFnUniqueReturned++; + STATS_DECLTRACK(UniqueReturnValue, FunctionReturn, + "Number of function with unique return"); + + // Callback to replace the uses of CB with the constant C. + auto ReplaceCallSiteUsersWith = [](CallBase &CB, Constant &C) { + if (CB.getNumUses() == 0 || CB.isMustTailCall()) + return ChangeStatus::UNCHANGED; + replaceAllInstructionUsesWith(CB, C); + return ChangeStatus::CHANGED; + }; // If the assumed unique return value is an argument, annotate it. if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) { - AssociatedVal = UniqueRVArg; - Changed = AbstractAttribute::manifest(A) | Changed; + // TODO: This should be handled differently! + this->AnchorVal = UniqueRVArg; + this->KindOrArgNo = UniqueRVArg->getArgNo(); + Changed = IRAttribute::manifest(A); + } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) { + // We can replace the returned value with the unique returned constant. + Value &AnchorValue = getAnchorValue(); + if (Function *F = dyn_cast<Function>(&AnchorValue)) { + for (const Use &U : F->uses()) + if (CallBase *CB = dyn_cast<CallBase>(U.getUser())) + if (CB->isCallee(&U)) { + Constant *RVCCast = + CB->getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, CB->getType()); + Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed; + } + } else { + assert(isa<CallBase>(AnchorValue) && + "Expcected a function or call base anchor!"); + Constant *RVCCast = + AnchorValue.getType() == RVC->getType() + ? RVC + : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType()); + Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast); + } + if (Changed == ChangeStatus::CHANGED) + STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn, + "Number of function returns replaced by constant return"); } return Changed; @@ -590,18 +1116,20 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { const std::string AAReturnedValuesImpl::getAsStr() const { return (isAtFixpoint() ? "returns(#" : "may-return(#") + - (isValidState() ? std::to_string(getNumReturnValues()) : "?") + ")"; + (isValidState() ? 
std::to_string(getNumReturnValues()) : "?") + + ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]"; } -Optional<Value *> AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { - // If checkForallReturnedValues provides a unique value, ignoring potential +Optional<Value *> +AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const { + // If checkForAllReturnedValues provides a unique value, ignoring potential // undef values that can also be present, it is assumed to be the actual // return value and forwarded to the caller of this method. If there are // multiple, a nullptr is returned indicating there cannot be a unique // returned value. Optional<Value *> UniqueRV; - std::function<bool(Value &)> Pred = [&](Value &RV) -> bool { + auto Pred = [&](Value &RV) -> bool { // If we found a second returned value and neither the current nor the saved // one is an undef, there is no unique returned value. Undefs are special // since we can pretend they have any value. @@ -618,14 +1146,15 @@ Optional<Value *> AAReturnedValuesImpl::getAssumedUniqueReturnValue() const { return true; }; - if (!checkForallReturnedValues(Pred)) + if (!A.checkForAllReturnedValues(Pred, *this)) UniqueRV = nullptr; return UniqueRV; } -bool AAReturnedValuesImpl::checkForallReturnedValues( - std::function<bool(Value &)> &Pred) const { +bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred) const { if (!isValidState()) return false; @@ -634,11 +1163,11 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( for (auto &It : ReturnedValues) { Value *RV = It.first; - ImmutableCallSite ICS(RV); - if (ICS && !HasOverdefinedReturnedCalls) + CallBase *CB = dyn_cast<CallBase>(RV); + if (CB && !UnresolvedCalls.count(CB)) continue; - if (!Pred(*RV)) + if (!Pred(*RV, It.second)) return false; } @@ -646,125 +1175,195 @@ bool AAReturnedValuesImpl::checkForallReturnedValues( } ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) { + size_t NumUnresolvedCalls = UnresolvedCalls.size(); + bool Changed = false; + + // State used in the value traversals starting in returned values. + struct RVState { + // The map in which we collect return values -> return instrs. + decltype(ReturnedValues) &RetValsMap; + // The flag to indicate a change. + bool &Changed; + // The return instrs we come from. + SmallSetVector<ReturnInst *, 4> RetInsts; + }; - // Check if we know of any values returned by the associated function, - // if not, we are done. - if (getNumReturnValues() == 0) { - indicateOptimisticFixpoint(); - return ChangeStatus::UNCHANGED; - } + // Callback for a leaf value returned by the associated function. + auto VisitValueCB = [](Value &Val, RVState &RVS, bool) -> bool { + auto Size = RVS.RetValsMap[&Val].size(); + RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end()); + bool Inserted = RVS.RetValsMap[&Val].size() != Size; + RVS.Changed |= Inserted; + LLVM_DEBUG({ + if (Inserted) + dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val + << " => " << RVS.RetInsts.size() << "\n"; + }); + return true; + }; - // Check if any of the returned values is a call site we can refine. - decltype(ReturnedValues) AddRVs; - bool HasCallSite = false; + // Helper method to invoke the generic value traversal. 
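+ // It walks every value that may reach a return and feeds each leaf it finds
+ // into the VisitValueCB callback defined above.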
+ auto VisitReturnedValue = [&](Value &RV, RVState &RVS) { + IRPosition RetValPos = IRPosition::value(RV); + return genericValueTraversal<AAReturnedValues, RVState>(A, RetValPos, *this, + RVS, VisitValueCB); + }; - // Look at all returned call sites. - for (auto &It : ReturnedValues) { - SmallPtrSet<ReturnInst *, 2> &ReturnInsts = It.second; - Value *RV = It.first; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Potentially returned value " << *RV - << "\n"); + // Callback for all "return intructions" live in the associated function. + auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) { + ReturnInst &Ret = cast<ReturnInst>(I); + RVState RVS({ReturnedValues, Changed, {}}); + RVS.RetInsts.insert(&Ret); + return VisitReturnedValue(*Ret.getReturnValue(), RVS); + }; - // Only call sites can change during an update, ignore the rest. - CallSite RetCS(RV); - if (!RetCS) + // Start by discovering returned values from all live returned instructions in + // the associated function. + if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret})) + return indicatePessimisticFixpoint(); + + // Once returned values "directly" present in the code are handled we try to + // resolve returned calls. + decltype(ReturnedValues) NewRVsMap; + for (auto &It : ReturnedValues) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *It.first + << " by #" << It.second.size() << " RIs\n"); + CallBase *CB = dyn_cast<CallBase>(It.first); + if (!CB || UnresolvedCalls.count(CB)) continue; - // For now, any call site we see will prevent us from directly fixing the - // state. However, if the information on the callees is fixed, the call - // sites will be removed and we will fix the information for this state. - HasCallSite = true; - - // Try to find a assumed unique return value for the called function. - auto *RetCSAA = A.getAAFor<AAReturnedValuesImpl>(*this, *RV); - if (!RetCSAA) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site (" << *RV - << ") with " << (RetCSAA ? "invalid" : "no") - << " associated state\n"); + if (!CB->getCalledFunction()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; } - // Try to find a assumed unique return value for the called function. - Optional<Value *> AssumedUniqueRV = RetCSAA->getAssumedUniqueReturnValue(); - - // If no assumed unique return value was found due to the lack of - // candidates, we may need to resolve more calls (through more update - // iterations) or the called function will not return. Either way, we simply - // stick with the call sites as return values. Because there were not - // multiple possibilities, we do not treat it as overdefined. - if (!AssumedUniqueRV.hasValue()) + // TODO: use the function scope once we have call site AAReturnedValues. + const auto &RetValAA = A.getAAFor<AAReturnedValues>( + *this, IRPosition::function(*CB->getCalledFunction())); + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: " + << RetValAA << "\n"); + + // Skip dead ends, thus if we do not know anything about the returned + // call we mark it as unresolved and it will stay that way. + if (!RetValAA.getState().isValidState()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB + << "\n"); + UnresolvedCalls.insert(CB); continue; + } - // If multiple, non-refinable values were found, there cannot be a unique - // return value for the called function. 
The returned call is overdefined! - if (!AssumedUniqueRV.getValue()) { - HasOverdefinedReturnedCalls = true; - LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned call site has multiple " - "potentially returned values\n"); + // Do not try to learn partial information. If the callee has unresolved + // return values we will treat the call as unresolved/opaque. + auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls(); + if (!RetValAAUnresolvedCalls.empty()) { + UnresolvedCalls.insert(CB); continue; } - LLVM_DEBUG({ - bool UniqueRVIsKnown = RetCSAA->isAtFixpoint(); - dbgs() << "[AAReturnedValues] Returned call site " - << (UniqueRVIsKnown ? "known" : "assumed") - << " unique return value: " << *AssumedUniqueRV << "\n"; - }); + // Now check if we can track transitively returned values. If possible, thus + // if all return value can be represented in the current scope, do so. + bool Unresolved = false; + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) || + isa<Constant>(RetVal)) + continue; + // Anything that did not fit in the above categories cannot be resolved, + // mark the call as unresolved. + LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value " + "cannot be translated: " + << *RetVal << "\n"); + UnresolvedCalls.insert(CB); + Unresolved = true; + break; + } - // The assumed unique return value. - Value *AssumedRetVal = AssumedUniqueRV.getValue(); - - // If the assumed unique return value is an argument, lookup the matching - // call site operand and recursively collect new returned values. - // If it is not an argument, it is just put into the set of returned values - // as we would have already looked through casts, phis, and similar values. - if (Argument *AssumedRetArg = dyn_cast<Argument>(AssumedRetVal)) - collectValuesRecursively(A, - RetCS.getArgOperand(AssumedRetArg->getArgNo()), - ReturnInsts, AddRVs); - else - AddRVs[AssumedRetVal].insert(ReturnInsts.begin(), ReturnInsts.end()); - } + if (Unresolved) + continue; - // Keep track of any change to trigger updates on dependent attributes. - ChangeStatus Changed = ChangeStatus::UNCHANGED; + // Now track transitively returned values. + unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB]; + if (NumRetAA == RetValAA.getNumReturnValues()) { + LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not " + "changed since it was seen last\n"); + continue; + } + NumRetAA = RetValAA.getNumReturnValues(); + + for (auto &RetValAAIt : RetValAA.returned_values()) { + Value *RetVal = RetValAAIt.first; + if (Argument *Arg = dyn_cast<Argument>(RetVal)) { + // Arguments are mapped to call site operands and we begin the traversal + // again. + bool Unused = false; + RVState RVS({NewRVsMap, Unused, RetValAAIt.second}); + VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS); + continue; + } else if (isa<CallBase>(RetVal)) { + // Call sites are resolved by the callee attribute over time, no need to + // do anything for us. + continue; + } else if (isa<Constant>(RetVal)) { + // Constants are valid everywhere, we can simply take them. + NewRVsMap[RetVal].insert(It.second.begin(), It.second.end()); + continue; + } + } + } - for (auto &It : AddRVs) { + // To avoid modifications to the ReturnedValues map while we iterate over it + // we kept record of potential new entries in a copy map, NewRVsMap. 
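+ // Merge the recorded entries into ReturnedValues now that the iteration
+ // above is finished.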
+ for (auto &It : NewRVsMap) { assert(!It.second.empty() && "Entry does not add anything."); auto &ReturnInsts = ReturnedValues[It.first]; for (ReturnInst *RI : It.second) - if (ReturnInsts.insert(RI).second) { + if (ReturnInsts.insert(RI)) { LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value " << *It.first << " => " << *RI << "\n"); - Changed = ChangeStatus::CHANGED; + Changed = true; } } - // If there is no call site in the returned values we are done. - if (!HasCallSite) { - indicateOptimisticFixpoint(); - return ChangeStatus::CHANGED; - } - - return Changed; + Changed |= (NumUnresolvedCalls != UnresolvedCalls.size()); + return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } -/// ------------------------ NoSync Function Attribute ------------------------- +struct AAReturnedValuesFunction final : public AAReturnedValuesImpl { + AAReturnedValuesFunction(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} -struct AANoSyncFunction : AANoSync, BooleanState { + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) } +}; - AANoSyncFunction(Function &F, InformationCache &InfoCache) - : AANoSync(F, InfoCache) {} +/// Returned values information for a call sites. +struct AAReturnedValuesCallSite final : AAReturnedValuesImpl { + AAReturnedValuesCallSite(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {} - /// See AbstractAttribute::getState() - /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - /// } + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. + llvm_unreachable("Abstract attributes for returned values are not " + "supported for call sites yet!"); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// ------------------------ NoSync Function Attribute ------------------------- + +struct AANoSyncImpl : AANoSync { + AANoSyncImpl(const IRPosition &IRP) : AANoSync(IRP) {} const std::string getAsStr() const override { return getAssumed() ? "nosync" : "may-sync"; @@ -773,12 +1372,6 @@ struct AANoSyncFunction : AANoSync, BooleanState { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; - /// See AANoSync::isAssumedNoSync() - bool isAssumedNoSync() const override { return getAssumed(); } - - /// See AANoSync::isKnownNoSync() - bool isKnownNoSync() const override { return getKnown(); } - /// Helper function used to determine whether an instruction is non-relaxed /// atomic. 
In other words, if an atomic instruction does not have unordered /// or monotonic ordering @@ -792,7 +1385,7 @@ struct AANoSyncFunction : AANoSync, BooleanState { static bool isNoSyncIntrinsic(Instruction *I); }; -bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { +bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) { if (!I->isAtomic()) return false; @@ -841,7 +1434,7 @@ bool AANoSyncFunction::isNonRelaxedAtomic(Instruction *I) { /// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics. /// FIXME: We should ipmrove the handling of intrinsics. -bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { +bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) { if (auto *II = dyn_cast<IntrinsicInst>(I)) { switch (II->getIntrinsicID()) { /// Element wise atomic memory intrinsics are can only be unordered, @@ -863,7 +1456,7 @@ bool AANoSyncFunction::isNoSyncIntrinsic(Instruction *I) { return false; } -bool AANoSyncFunction::isVolatile(Instruction *I) { +bool AANoSyncImpl::isVolatile(Instruction *I) { assert(!ImmutableCallSite(I) && !isa<CallBase>(I) && "Calls should not be checked here"); @@ -881,482 +1474,4377 @@ bool AANoSyncFunction::isVolatile(Instruction *I) { } } -ChangeStatus AANoSyncFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { - /// We are looking for volatile instructions or Non-Relaxed atomics. - /// FIXME: We should ipmrove the handling of intrinsics. - for (Instruction *I : InfoCache.getReadOrWriteInstsForFunction(F)) { - ImmutableCallSite ICS(I); - auto *NoSyncAA = A.getAAFor<AANoSyncFunction>(*this, *I); + auto CheckRWInstForNoSync = [&](Instruction &I) { + /// We are looking for volatile instructions or Non-Relaxed atomics. + /// FIXME: We should improve the handling of intrinsics. - if (isa<IntrinsicInst>(I) && isNoSyncIntrinsic(I)) - continue; + if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I)) + return true; - if (ICS && (!NoSyncAA || !NoSyncAA->isAssumedNoSync()) && - !ICS.hasFnAttr(Attribute::NoSync)) { + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + if (ICS.hasFnAttr(Attribute::NoSync)) + return true; + + const auto &NoSyncAA = + A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(ICS)); + if (NoSyncAA.isAssumedNoSync()) + return true; + return false; + } + + if (!isVolatile(&I) && !isNonRelaxedAtomic(&I)) + return true; + + return false; + }; + + auto CheckForNoSync = [&](Instruction &I) { + // At this point we handled all read/write effects and they are all + // nosync, so they can be skipped. + if (I.mayReadOrWriteMemory()) + return true; + + // non-convergent and readnone imply nosync. + return !ImmutableCallSite(&I).isConvergent(); + }; + + if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) || + !A.checkForAllCallLikeInstructions(CheckForNoSync, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; +} + +struct AANoSyncFunction final : public AANoSyncImpl { + AANoSyncFunction(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) } +}; + +/// NoSync attribute deduction for a call sites. +struct AANoSyncCallSite final : AANoSyncImpl { + AANoSyncCallSite(const IRPosition &IRP) : AANoSyncImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
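+ /// Without a known callee there is nothing to query, so give up right away.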
+ void initialize(Attributor &A) override { + AANoSyncImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoSync::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); } +}; + +/// ------------------------ No-Free Attributes ---------------------------- + +struct AANoFreeImpl : public AANoFree { + AANoFreeImpl(const IRPosition &IRP) : AANoFree(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto CheckForNoFree = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoFree)) + return true; + + const auto &NoFreeAA = + A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(ICS)); + return NoFreeAA.isAssumedNoFree(); + }; + + if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "nofree" : "may-free"; + } +}; + +struct AANoFreeFunction final : public AANoFreeImpl { + AANoFreeFunction(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) } +}; + +/// NoFree attribute deduction for a call sites. +struct AANoFreeCallSite final : AANoFreeImpl { + AANoFreeCallSite(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoFreeImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoFree::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); } +}; + +/// NoFree attribute for floating values. +struct AANoFreeFloating : AANoFreeImpl { + AANoFreeFloating(const IRPosition &IRP) : AANoFreeImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)} + + /// See Abstract Attribute::updateImpl(...). 
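+ /// The update follows the transitive uses of the value: call site argument
+ /// uses must themselves be nofree, while GEPs, casts, PHIs, and selects are
+ /// looked through.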
+ ChangeStatus updateImpl(Attributor &A) override { + const IRPosition &IRP = getIRPosition(); + + const auto &NoFreeAA = + A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP)); + if (NoFreeAA.isAssumedNoFree()) + return ChangeStatus::UNCHANGED; + + Value &AssociatedValue = getIRPosition().getAssociatedValue(); + auto Pred = [&](const Use &U, bool &Follow) -> bool { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (auto *CB = dyn_cast<CallBase>(UserI)) { + if (CB->isBundleOperand(&U)) + return false; + if (!CB->isArgOperand(&U)) + return true; + unsigned ArgNo = CB->getArgOperandNo(&U); + + const auto &NoFreeArg = A.getAAFor<AANoFree>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + return NoFreeArg.isAssumedNoFree(); + } + + if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) || + isa<PHINode>(UserI) || isa<SelectInst>(UserI)) { + Follow = true; + return true; + } + + // Unknown user. + return false; + }; + if (!A.checkForAllUses(Pred, *this, AssociatedValue)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } +}; + +/// NoFree attribute for a call site argument. +struct AANoFreeArgument final : AANoFreeFloating { + AANoFreeArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) } +}; + +/// NoFree attribute for call site arguments. +struct AANoFreeCallSiteArgument final : AANoFreeFloating { + AANoFreeCallSiteArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)}; +}; + +/// NoFree attribute for function return value. +struct AANoFreeReturned final : AANoFreeFloating { + AANoFreeReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoFree is not applicable to function returns!"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// NoFree attribute deduction for a call site return value. 
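+/// nofree cannot be attached to a call site return value in the IR, so
+/// manifest is overridden to do nothing; the deduced state is still available
+/// to other attributes.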
+struct AANoFreeCallSiteReturned final : AANoFreeFloating { + AANoFreeCallSiteReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) {} + + ChangeStatus manifest(Attributor &A) override { + return ChangeStatus::UNCHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) } +}; + +/// ------------------------ NonNull Argument Attribute ------------------------ +static int64_t getKnownNonNullAndDerefBytesForUse( + Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue, + const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) { + TrackUse = false; + + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return 0; + + Type *PtrTy = UseV->getType(); + const Function *F = I->getFunction(); + bool NullPointerIsDefined = + F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true; + const DataLayout &DL = A.getInfoCache().getDL(); + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U)) + return 0; + + if (ICS.isCallee(U)) { + IsNonNull |= !NullPointerIsDefined; + return 0; } - if (ICS) - continue; + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + // As long as we only use known information there is no need to track + // dependences here. + auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP, + /* TrackDependence */ false); + IsNonNull |= DerefAA.isKnownNonNull(); + return DerefAA.getKnownDereferenceableBytes(); + } - if (!isVolatile(I) && !isNonRelaxedAtomic(I)) - continue; + // We need to follow common pointer manipulation uses to the accesses they + // feed into. We can try to be smart to avoid looking through things we do not + // like for now, e.g., non-inbounds GEPs. + if (isa<CastInst>(I)) { + TrackUse = true; + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand(I, Offset, DL)) { + if (Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset; + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } + } + + /// Corner case when an offset is 0. + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Offset == 0 && Base == &AssociatedValue && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } + } + + return 0; +} + +struct AANonNullImpl : AANonNull { + AANonNullImpl(const IRPosition &IRP) + : AANonNull(IRP), + NullIsDefined(NullPointerIsDefined( + getAnchorScope(), + getAssociatedValue().getType()->getPointerAddressSpace())) {} + + /// See AbstractAttribute::initialize(...). 
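+ /// Values already annotated nonnull or dereferenceable are fixed
+ /// optimistically (unless null is a valid pointer in their address space);
+ /// constant null pointers are fixed pessimistically.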
+ void initialize(Attributor &A) override { + if (!NullIsDefined && + hasAttr({Attribute::NonNull, Attribute::Dereferenceable})) + indicateOptimisticFixpoint(); + else if (isa<ConstantPointerNull>(getAssociatedValue())) + indicatePessimisticFixpoint(); + else + AANonNull::initialize(A); + } + + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I, + IsNonNull, TrackUse); + setKnown(IsNonNull); + return TrackUse; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "nonnull" : "may-null"; + } + + /// Flag to determine if the underlying value can be null and still allow + /// valid accesses. + const bool NullIsDefined; +}; + +/// NonNull attribute for a floating value. +struct AANonNullFloating + : AAFromMustBeExecutedContext<AANonNull, AANonNullImpl> { + using Base = AAFromMustBeExecutedContext<AANonNull, AANonNullImpl>; + AANonNullFloating(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + if (isKnownNonNull()) + return Change; + + if (!NullIsDefined) { + const auto &DerefAA = + A.getAAFor<AADereferenceable>(*this, getIRPosition()); + if (DerefAA.getAssumedDereferenceableBytes()) + return Change; + } + + const DataLayout &DL = A.getDataLayout(); + + DominatorTree *DT = nullptr; + InformationCache &InfoCache = A.getInfoCache(); + if (const Function *Fn = getAnchorScope()) + DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn); + + auto VisitValueCB = [&](Value &V, AANonNull::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, getCtxI(), DT)) + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AANonNull::StateType &NS = + static_cast<const AANonNull::StateType &>(AA.getState()); + T ^= NS; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal<AANonNull, StateType>(A, getIRPosition(), *this, + T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; + +/// NonNull attribute for function return value. +struct AANonNullReturned final + : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl> { + AANonNullReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } +}; + +/// NonNull attribute for function argument. 
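+/// Deduced by clamping the nonnull states of all call site arguments and by
+/// inspecting the must-be-executed context of the argument.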
+struct AANonNullArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull, + AANonNullImpl> { + AANonNullArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull, + AANonNullImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) } +}; + +struct AANonNullCallSiteArgument final : AANonNullFloating { + AANonNullCallSiteArgument(const IRPosition &IRP) : AANonNullFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) } +}; + +/// NonNull attribute for a call site return position. +struct AANonNullCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull, + AANonNullImpl> { + AANonNullCallSiteReturned(const IRPosition &IRP) + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull, + AANonNullImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) } +}; + +/// ------------------------ No-Recurse Attributes ---------------------------- + +struct AANoRecurseImpl : public AANoRecurse { + AANoRecurseImpl(const IRPosition &IRP) : AANoRecurse(IRP) {} + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "norecurse" : "may-recurse"; + } +}; + +struct AANoRecurseFunction final : AANoRecurseImpl { + AANoRecurseFunction(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + if (const Function *F = getAnchorScope()) + if (A.getInfoCache().getSccSize(*F) == 1) + return; indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; } - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); - auto Opcodes = {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { - for (unsigned Opcode : Opcodes) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - // At this point we handled all read/write effects and they are all - // nosync, so they can be skipped. - if (I->mayReadOrWriteMemory()) - continue; + auto CheckForNoRecurse = [&](Instruction &I) { + ImmutableCallSite ICS(&I); + if (ICS.hasFnAttr(Attribute::NoRecurse)) + return true; - ImmutableCallSite ICS(I); + const auto &NoRecurseAA = + A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(ICS)); + if (!NoRecurseAA.isAssumedNoRecurse()) + return false; - // non-convergent and readnone imply nosync. - if (!ICS.isConvergent()) - continue; + // Recursion to the same function + if (ICS.getCalledFunction() == getAnchorScope()) + return false; + + return true; + }; + if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this)) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) } +}; + +/// NoRecurse attribute deduction for a call sites. +struct AANoRecurseCallSite final : AANoRecurseImpl { + AANoRecurseCallSite(const IRPosition &IRP) : AANoRecurseImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
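+ /// As for the other call site attributes, bail if the callee is unknown.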
+ void initialize(Attributor &A) override { + AANoRecurseImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoRecurse::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); } +}; + +/// -------------------- Undefined-Behavior Attributes ------------------------ + +struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior { + AAUndefinedBehaviorImpl(const IRPosition &IRP) : AAUndefinedBehavior(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + // through a pointer (i.e. also branches etc.) + ChangeStatus updateImpl(Attributor &A) override { + const size_t UBPrevSize = KnownUBInsts.size(); + const size_t NoUBPrevSize = AssumedNoUBInsts.size(); + + auto InspectMemAccessInstForUB = [&](Instruction &I) { + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // If we reach here, we know we have an instruction + // that accesses memory through a pointer operand, + // for which getPointerOperand() should give it to us. + const Value *PtrOp = + Attributor::getPointerOperand(&I, /* AllowVolatile */ true); + assert(PtrOp && + "Expected pointer operand of memory accessing instruction"); + + // A memory access through a pointer is considered UB + // only if the pointer has constant null value. + // TODO: Expand it to not only check constant values. + if (!isa<ConstantPointerNull>(PtrOp)) { + AssumedNoUBInsts.insert(&I); + return true; + } + const Type *PtrTy = PtrOp->getType(); + + // Because we only consider instructions inside functions, + // assume that a parent function exists. + const Function *F = I.getFunction(); + + // A memory access using constant null pointer is only considered UB + // if null pointer is _not_ defined for the target platform. + if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace())) + AssumedNoUBInsts.insert(&I); + else + KnownUBInsts.insert(&I); + return true; + }; + + auto InspectBrInstForUB = [&](Instruction &I) { + // A conditional branch instruction is considered UB if it has `undef` + // condition. + + // Skip instructions that are already saved. + if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I)) + return true; + + // We know we have a branch instruction. + auto BrInst = cast<BranchInst>(&I); + + // Unconditional branches are never considered UB. + if (BrInst->isUnconditional()) + return true; + + // Either we stopped and the appropriate action was taken, + // or we got back a simplified value to continue. 
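+ // Undef conditions are recorded as known UB inside the helper; when a
+ // simplified condition is returned, the branch is remembered as UB-free.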
+ Optional<Value *> SimplifiedCond = + stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst); + if (!SimplifiedCond.hasValue()) + return true; + AssumedNoUBInsts.insert(&I); + return true; + }; + + A.checkForAllInstructions(InspectMemAccessInstForUB, *this, + {Instruction::Load, Instruction::Store, + Instruction::AtomicCmpXchg, + Instruction::AtomicRMW}); + A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br}); + if (NoUBPrevSize != AssumedNoUBInsts.size() || + UBPrevSize != KnownUBInsts.size()) return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + bool isKnownToCauseUB(Instruction *I) const override { + return KnownUBInsts.count(I); + } + + bool isAssumedToCauseUB(Instruction *I) const override { + // In simple words, if an instruction is not in the assumed to _not_ + // cause UB, then it is assumed UB (that includes those + // in the KnownUBInsts set). The rest is boilerplate + // is to ensure that it is one of the instructions we test + // for UB. + + switch (I->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + return !AssumedNoUBInsts.count(I); + case Instruction::Br: { + auto BrInst = cast<BranchInst>(I); + if (BrInst->isUnconditional()) + return false; + return !AssumedNoUBInsts.count(I); + } break; + default: + return false; } + return false; } - return ChangeStatus::UNCHANGED; + ChangeStatus manifest(Attributor &A) override { + if (KnownUBInsts.empty()) + return ChangeStatus::UNCHANGED; + for (Instruction *I : KnownUBInsts) + A.changeToUnreachableAfterManifest(I); + return ChangeStatus::CHANGED; + } + + /// See AbstractAttribute::getAsStr() + const std::string getAsStr() const override { + return getAssumed() ? "undefined-behavior" : "no-ub"; + } + + /// Note: The correctness of this analysis depends on the fact that the + /// following 2 sets will stop changing after some point. + /// "Change" here means that their size changes. + /// The size of each set is monotonically increasing + /// (we only add items to them) and it is upper bounded by the number of + /// instructions in the processed function (we can never save more + /// elements in either set than this number). Hence, at some point, + /// they will stop increasing. + /// Consequently, at some point, both sets will have stopped + /// changing, effectively making the analysis reach a fixpoint. + + /// Note: These 2 sets are disjoint and an instruction can be considered + /// one of 3 things: + /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in + /// the KnownUBInsts set. + /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior + /// has a reason to assume it). + /// 3) Assumed to not cause UB. very other instruction - AAUndefinedBehavior + /// could not find a reason to assume or prove that it can cause UB, + /// hence it assumes it doesn't. We have a set for these instructions + /// so that we don't reprocess them in every update. + /// Note however that instructions in this set may cause UB. + +protected: + /// A set of all live instructions _known_ to cause UB. + SmallPtrSet<Instruction *, 8> KnownUBInsts; + +private: + /// A set of all the (live) instructions that are assumed to _not_ cause UB. + SmallPtrSet<Instruction *, 8> AssumedNoUBInsts; + + // Should be called on updates in which if we're processing an instruction + // \p I that depends on a value \p V, one of the following has to happen: + // - If the value is assumed, then stop. 
+ // - If the value is known but undef, then consider it UB. + // - Otherwise, do specific processing with the simplified value. + // We return None in the first 2 cases to signify that an appropriate + // action was taken and the caller should stop. + // Otherwise, we return the simplified value that the caller should + // use for specific processing. + Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V, + Instruction *I) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V)); + Optional<Value *> SimplifiedV = + ValueSimplifyAA.getAssumedSimplifiedValue(A); + if (!ValueSimplifyAA.isKnown()) { + // Don't depend on assumed values. + return llvm::None; + } + if (!SimplifiedV.hasValue()) { + // If it is known (which we tested above) but it doesn't have a value, + // then we can assume `undef` and hence the instruction is UB. + KnownUBInsts.insert(I); + return llvm::None; + } + Value *Val = SimplifiedV.getValue(); + if (isa<UndefValue>(Val)) { + KnownUBInsts.insert(I); + return llvm::None; + } + return Val; + } +}; + +struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl { + AAUndefinedBehaviorFunction(const IRPosition &IRP) + : AAUndefinedBehaviorImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(UndefinedBehaviorInstruction, Instruction, + "Number of instructions known to have UB"); + BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) += + KnownUBInsts.size(); + } +}; + +/// ------------------------ Will-Return Attributes ---------------------------- + +// Helper function that checks whether a function has any cycle. +// TODO: Replace with more efficent code +static bool containsCycle(Function &F) { + SmallPtrSet<BasicBlock *, 32> Visited; + + // Traverse BB by dfs and check whether successor is already visited. + for (BasicBlock *BB : depth_first(&F)) { + Visited.insert(BB); + for (auto *SuccBB : successors(BB)) { + if (Visited.count(SuccBB)) + return true; + } + } + return false; } -/// ------------------------ No-Free Attributes ---------------------------- +// Helper function that checks the function have a loop which might become an +// endless loop +// FIXME: Any cycle is regarded as endless loop for now. +// We have to allow some patterns. +static bool containsPossiblyEndlessLoop(Function *F) { + return !F || !F->hasExactDefinition() || containsCycle(*F); +} -struct AANoFreeFunction : AbstractAttribute, BooleanState { +struct AAWillReturnImpl : public AAWillReturn { + AAWillReturnImpl(const IRPosition &IRP) : AAWillReturn(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AANoFreeFunction(Function &F, InformationCache &InfoCache) - : AbstractAttribute(F, InfoCache) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAWillReturn::initialize(A); - /// See AbstractAttribute::getState() - ///{ - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } - ///} + Function *F = getAssociatedFunction(); + if (containsPossiblyEndlessLoop(F)) + indicatePessimisticFixpoint(); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_FUNCTION; } + /// See AbstractAttribute::updateImpl(...). 
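+ /// Every call-like instruction has to be assumed willreturn, and unless that
+ /// is already known it additionally has to be assumed norecurse.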
+ ChangeStatus updateImpl(Attributor &A) override { + auto CheckForWillReturn = [&](Instruction &I) { + IRPosition IPos = IRPosition::callsite_function(ImmutableCallSite(&I)); + const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos); + if (WillReturnAA.isKnownWillReturn()) + return true; + if (!WillReturnAA.isAssumedWillReturn()) + return false; + const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos); + return NoRecurseAA.isAssumedNoRecurse(); + }; - /// See AbstractAttribute::getAsStr(). + if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::getAsStr() const std::string getAsStr() const override { - return getAssumed() ? "nofree" : "may-free"; + return getAssumed() ? "willreturn" : "may-noreturn"; + } +}; + +struct AAWillReturnFunction final : AAWillReturnImpl { + AAWillReturnFunction(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) } +}; + +/// WillReturn attribute deduction for a call sites. +struct AAWillReturnCallSite final : AAWillReturnImpl { + AAWillReturnCallSite(const IRPosition &IRP) : AAWillReturnImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAWillReturnImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAWillReturn::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); } +}; - /// See AbstractAttribute::getAttrKind(). - Attribute::AttrKind getAttrKind() const override { return ID; } +/// -------------------AAReachability Attribute-------------------------- - /// Return true if "nofree" is assumed. - bool isAssumedNoFree() const { return getAssumed(); } +struct AAReachabilityImpl : AAReachability { + AAReachabilityImpl(const IRPosition &IRP) : AAReachability(IRP) {} - /// Return true if "nofree" is known. - bool isKnownNoFree() const { return getKnown(); } + const std::string getAsStr() const override { + // TODO: Return the number of reachable queries. + return "reachable"; + } + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { indicatePessimisticFixpoint(); } - /// The identifier used by the Attributor for this class of attributes. - static constexpr Attribute::AttrKind ID = Attribute::NoFree; + /// See AbstractAttribute::updateImpl(...). 
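+ /// Reachability is not computed yet; both initialize and update fall back to
+ /// the pessimistic fixpoint.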
+ ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } }; -ChangeStatus AANoFreeFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +struct AAReachabilityFunction final : public AAReachabilityImpl { + AAReachabilityFunction(const IRPosition &IRP) : AAReachabilityImpl(IRP) {} - // The map from instruction opcodes to those instructions in the function. - auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); } +}; - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { - for (Instruction *I : OpcodeInstMap[Opcode]) { +/// ------------------------ NoAlias Argument Attribute ------------------------ + +struct AANoAliasImpl : AANoAlias { + AANoAliasImpl(const IRPosition &IRP) : AANoAlias(IRP) {} + + const std::string getAsStr() const override { + return getAssumed() ? "noalias" : "may-alias"; + } +}; + +/// NoAlias attribute for a floating value. +struct AANoAliasFloating final : AANoAliasImpl { + AANoAliasFloating(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Value &Val = getAssociatedValue(); + if (isa<AllocaInst>(Val)) + indicateOptimisticFixpoint(); + if (isa<ConstantPointerNull>(Val) && + Val.getType()->getPointerAddressSpace() == 0) + indicateOptimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Implement this. + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(noalias) + } +}; + +/// NoAlias attribute for an argument. +struct AANoAliasArgument final + : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> { + using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>; + AANoAliasArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::update(...). + ChangeStatus updateImpl(Attributor &A) override { + // We have to make sure no-alias on the argument does not break + // synchronization when this is a callback argument, see also [1] below. + // If synchronization cannot be affected, we delegate to the base updateImpl + // function, otherwise we give up for now. + + // If the function is no-sync, no-alias cannot break synchronization. + const auto &NoSyncAA = A.getAAFor<AANoSync>( + *this, IRPosition::function_scope(getIRPosition())); + if (NoSyncAA.isAssumedNoSync()) + return Base::updateImpl(A); + + // If the argument is read-only, no-alias cannot break synchronization. + const auto &MemBehaviorAA = + A.getAAFor<AAMemoryBehavior>(*this, getIRPosition()); + if (MemBehaviorAA.isAssumedReadOnly()) + return Base::updateImpl(A); + + // If the argument is never passed through callbacks, no-alias cannot break + // synchronization. + if (A.checkForAllCallSites( + [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this, + true)) + return Base::updateImpl(A); + + // TODO: add no-alias but make sure it doesn't break synchronization by + // introducing fake uses. See: + // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. 
Finkel, + // International Workshop on OpenMP 2018, + // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf + + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) } +}; + +struct AANoAliasCallSiteArgument final : AANoAliasImpl { + AANoAliasCallSiteArgument(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // See callsite argument attribute and callee argument attribute. + ImmutableCallSite ICS(&getAnchorValue()); + if (ICS.paramHasAttr(getArgNo(), Attribute::NoAlias)) + indicateOptimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // We can deduce "noalias" if the following conditions hold. + // (i) Associated value is assumed to be noalias in the definition. + // (ii) Associated value is assumed to be no-capture in all the uses + // possibly executed before this callsite. + // (iii) There is no other pointer argument which could alias with the + // value. + + const Value &V = getAssociatedValue(); + const IRPosition IRP = IRPosition::value(V); + + // (i) Check whether noalias holds in the definition. + + auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP); + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] check definition: " << V + << " :: " << NoAliasAA << "\n"); + + if (!NoAliasAA.isAssumedNoAlias()) + return indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " is assumed NoAlias in the definition\n"); + + // (ii) Check whether the value is captured in the scope using AANoCapture. + // FIXME: This is conservative though, it is better to look at CFG and + // check only uses possibly executed before this callsite. + + auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP); + if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + LLVM_DEBUG( + dbgs() << "[Attributor][AANoAliasCSArg] " << V + << " cannot be noalias as it is potentially captured\n"); + return indicatePessimisticFixpoint(); + } + + // (iii) Check there is no other pointer argument which could alias with the + // value. + // TODO: AbstractCallSite + ImmutableCallSite ICS(&getAnchorValue()); + for (unsigned i = 0; i < ICS.getNumArgOperands(); i++) { + if (getArgNo() == (int)i) + continue; + const Value *ArgOp = ICS.getArgOperand(i); + if (!ArgOp->getType()->isPointerTy()) + continue; + + if (const Function *F = getAnchorScope()) { + if (AAResults *AAR = A.getInfoCache().getAAResultsForFunction(*F)) { + bool IsAliasing = !AAR->isNoAlias(&getAssociatedValue(), ArgOp); + LLVM_DEBUG(dbgs() + << "[Attributor][NoAliasCSArg] Check alias between " + "callsite arguments " + << AAR->isNoAlias(&getAssociatedValue(), ArgOp) << " " + << getAssociatedValue() << " " << *ArgOp << " => " + << (IsAliasing ? "" : "no-") << "alias \n"); + + if (!IsAliasing) + continue; + } + } + return indicatePessimisticFixpoint(); + } + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) } +}; + +/// NoAlias attribute for function return value. +struct AANoAliasReturned final : AANoAliasImpl { + AANoAliasReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). 
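Check (iii) above scans the other call operands and gives up as soon as any other pointer argument may alias the value; only a proven no-alias answer lets the loop continue. A standalone sketch of that loop shape, with a caller-supplied may-alias oracle standing in for AAResults; the types and names are illustrative only.

#include <cstddef>
#include <functional>
#include <vector>

struct ArgInfo {
  const void *Ptr;   // the argument value (null for non-pointer operands)
};

// Returns true if no *other* pointer argument of the call may alias the
// argument at index ArgNo, according to MayAlias. Any possible alias is
// a reason to give up, mirroring the structure of check (iii).
static bool noOtherArgAliases(
    const std::vector<ArgInfo> &Args, size_t ArgNo,
    const std::function<bool(const void *, const void *)> &MayAlias) {
  const void *V = Args[ArgNo].Ptr;
  for (size_t I = 0, E = Args.size(); I != E; ++I) {
    if (I == ArgNo)
      continue;                  // skip the argument itself
    const void *Other = Args[I].Ptr;
    if (!Other)
      continue;                  // non-pointer operands cannot alias
    if (MayAlias(V, Other))
      return false;              // pessimistic fixpoint in the pass
  }
  return true;
}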
+ virtual ChangeStatus updateImpl(Attributor &A) override { - auto ICS = ImmutableCallSite(I); - auto *NoFreeAA = A.getAAFor<AANoFreeFunction>(*this, *I); + auto CheckReturnValue = [&](Value &RV) -> bool { + if (Constant *C = dyn_cast<Constant>(&RV)) + if (C->isNullValue() || isa<UndefValue>(C)) + return true; - if ((!NoFreeAA || !NoFreeAA->isAssumedNoFree()) && - !ICS.hasFnAttr(Attribute::NoFree)) { + /// For now, we can only deduce noalias if we have call sites. + /// FIXME: add more support. + ImmutableCallSite ICS(&RV); + if (!ICS) + return false; + + const IRPosition &RVPos = IRPosition::value(RV); + const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos); + if (!NoAliasAA.isAssumedNoAlias()) + return false; + + const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos); + return NoCaptureAA.isAssumedNoCaptureMaybeReturned(); + }; + + if (!A.checkForAllReturnedValues(CheckReturnValue, *this)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) } +}; + +/// NoAlias attribute deduction for a call site return value. +struct AANoAliasCallSiteReturned final : AANoAliasImpl { + AANoAliasCallSiteReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoAliasImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::returned(*F); + auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AANoAlias::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); } +}; + +/// -------------------AAIsDead Function Attribute----------------------- + +struct AAIsDeadValueImpl : public AAIsDead { + AAIsDeadValueImpl(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AAIsDead::isAssumedDead(). + bool isAssumedDead() const override { return getAssumed(); } + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { return false; } + + /// See AAIsDead::isAssumedDead(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + return I == getCtxI() && isAssumedDead(); + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return I == getCtxI() && getKnown(); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return isAssumedDead() ? "assumed-dead" : "assumed-live"; + } +}; + +struct AAIsDeadFloating : public AAIsDeadValueImpl { + AAIsDeadFloating(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
+ void initialize(Attributor &A) override { + if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue())) + if (!wouldInstructionBeTriviallyDead(I)) indicatePessimisticFixpoint(); + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto UsePred = [&](const Use &U, bool &Follow) { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (CallSite CS = CallSite(UserI)) { + if (!CS.isArgOperand(&U)) + return false; + const IRPosition &CSArgPos = + IRPosition::callsite_argument(CS, CS.getArgumentNo(&U)); + const auto &CSArgIsDead = A.getAAFor<AAIsDead>(*this, CSArgPos); + return CSArgIsDead.isAssumedDead(); + } + if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) { + const IRPosition &RetPos = IRPosition::returned(*RI->getFunction()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, RetPos); + return RetIsDeadAA.isAssumedDead(); + } + Follow = true; + return wouldInstructionBeTriviallyDead(UserI); + }; + + if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + Value &V = getAssociatedValue(); + if (auto *I = dyn_cast<Instruction>(&V)) + if (wouldInstructionBeTriviallyDead(I)) { + A.deleteAfterManifest(*I); + return ChangeStatus::CHANGED; + } + + if (V.use_empty()) + return ChangeStatus::UNCHANGED; + + UndefValue &UV = *UndefValue::get(V.getType()); + bool AnyChange = A.changeValueAfterManifest(V, UV); + return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(IsDead) + } +}; + +struct AAIsDeadArgument : public AAIsDeadFloating { + AAIsDeadArgument(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (!getAssociatedFunction()->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = AAIsDeadFloating::manifest(A); + Argument &Arg = *getAssociatedArgument(); + if (Arg.getParent()->hasLocalLinkage()) + if (A.registerFunctionSignatureRewrite( + Arg, /* ReplacementTypes */ {}, + Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{}, + Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) return ChangeStatus::CHANGED; + return Changed; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl { + AAIsDeadCallSiteArgument(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (isa<UndefValue>(getAssociatedValue())) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. 
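The use predicate above classifies every user of the value: a call operand is fine if the matching call-site argument is assumed dead, a return is fine if the function's return value is assumed dead, and any other user must itself be trivially dead, in which case its own uses are followed as well. A toy sketch of that recursion over an index-based use graph; the types are made up for illustration.

#include <vector>

enum class UserKind { CallArg, Return, Other };

struct ToyUser {
  UserKind Kind;
  bool TargetAssumedDead;        // dead call-site argument / dead return value?
  bool TriviallyDead;            // would this user itself be trivially dead?
  std::vector<int> UsersOfUser;  // indices of the users of this user
};

// Returns true if user `Idx` (and, for the Other case, everything it
// feeds) still allows the "value is dead" assumption.
static bool useKeepsDeadAssumption(const std::vector<ToyUser> &Users, int Idx) {
  const ToyUser &U = Users[Idx];
  switch (U.Kind) {
  case UserKind::CallArg:   // dead iff the callee argument is assumed dead
  case UserKind::Return:    // dead iff the function's return value is dead
    return U.TargetAssumedDead;
  case UserKind::Other:
    // Follow = true in the pass: the user must be trivially dead and all
    // of its own users must allow the assumption as well.
    if (!U.TriviallyDead)
      return false;
    for (int Next : U.UsersOfUser)
      if (!useKeepsDeadAssumption(Users, Next))
        return false;
    return true;
  }
  return false;
}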
+ Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + CallBase &CB = cast<CallBase>(getAnchorValue()); + Use &U = CB.getArgOperandUse(getArgNo()); + assert(!isa<UndefValue>(U.get()) && + "Expected undef values to be filtered out!"); + UndefValue &UV = *UndefValue::get(U->getType()); + if (A.changeUseAfterManifest(U, UV)) + return ChangeStatus::CHANGED; + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) } +}; + +struct AAIsDeadReturned : public AAIsDeadValueImpl { + AAIsDeadReturned(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + + auto PredForCallSite = [&](AbstractCallSite ACS) { + if (ACS.isCallbackCall()) + return false; + const IRPosition &CSRetPos = + IRPosition::callsite_returned(ACS.getCallSite()); + const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, CSRetPos); + return RetIsDeadAA.isAssumedDead(); + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + return indicatePessimisticFixpoint(); + + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // TODO: Rewrite the signature to return void? + bool AnyChange = false; + UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType()); + auto RetInstPred = [&](Instruction &I) { + ReturnInst &RI = cast<ReturnInst>(I); + if (!isa<UndefValue>(RI.getReturnValue())) + AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV); + return true; + }; + A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret}); + return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) } +}; + +struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { + AAIsDeadCallSiteReturned(const IRPosition &IRP) : AAIsDeadFloating(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(IsDead) } +}; + +struct AAIsDeadFunction : public AAIsDead { + AAIsDeadFunction(const IRPosition &IRP) : AAIsDead(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + const Function *F = getAssociatedFunction(); + if (F && !F->isDeclaration()) { + ToBeExploredFrom.insert(&F->getEntryBlock().front()); + assumeLive(A, F->getEntryBlock()); + } + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" + + std::to_string(getAssociatedFunction()->size()) + "][#TBEP " + + std::to_string(ToBeExploredFrom.size()) + "][#KDE " + + std::to_string(KnownDeadEnds.size()) + "]"; + } + + /// See AbstractAttribute::manifest(...). 
+ ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function &F = *getAssociatedFunction(); + + if (AssumedLiveBlocks.empty()) { + A.deleteAfterManifest(F); + return ChangeStatus::CHANGED; + } + + // Flag to determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. + bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F); + + KnownDeadEnds.set_union(ToBeExploredFrom); + for (const Instruction *DeadEndI : KnownDeadEnds) { + auto *CB = dyn_cast<CallBase>(DeadEndI); + if (!CB) + continue; + const auto &NoReturnAA = + A.getAAFor<AANoReturn>(*this, IRPosition::callsite_function(*CB)); + bool MayReturn = !NoReturnAA.isAssumedNoReturn(); + if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB))) + continue; + + if (auto *II = dyn_cast<InvokeInst>(DeadEndI)) + A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II)); + else + A.changeToUnreachableAfterManifest( + const_cast<Instruction *>(DeadEndI->getNextNode())); + HasChanged = ChangeStatus::CHANGED; + } + + for (BasicBlock &BB : F) + if (!AssumedLiveBlocks.count(&BB)) + A.deleteAfterManifest(BB); + + return HasChanged; + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// Returns true if the function is assumed dead. + bool isAssumedDead() const override { return false; } + + /// See AAIsDead::isAssumedDead(BasicBlock *). + bool isAssumedDead(const BasicBlock *BB) const override { + assert(BB->getParent() == getAssociatedFunction() && + "BB must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + return !AssumedLiveBlocks.count(BB); + } + + /// See AAIsDead::isKnownDead(BasicBlock *). + bool isKnownDead(const BasicBlock *BB) const override { + return getKnown() && isAssumedDead(BB); + } + + /// See AAIsDead::isAssumed(Instruction *I). + bool isAssumedDead(const Instruction *I) const override { + assert(I->getParent()->getParent() == getAssociatedFunction() && + "Instruction must be in the same anchor scope function."); + + if (!getAssumed()) + return false; + + // If it is not in AssumedLiveBlocks then it for sure dead. + // Otherwise, it can still be after noreturn call in a live block. + if (!AssumedLiveBlocks.count(I->getParent())) + return true; + + // If it is not after a liveness barrier it is live. + const Instruction *PrevI = I->getPrevNode(); + while (PrevI) { + if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI)) + return true; + PrevI = PrevI->getPrevNode(); + } + return false; + } + + /// See AAIsDead::isKnownDead(Instruction *I). + bool isKnownDead(const Instruction *I) const override { + return getKnown() && isAssumedDead(I); + } + + /// Determine if \p F might catch asynchronous exceptions. + static bool mayCatchAsynchronousExceptions(const Function &F) { + return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F); + } + + /// Assume \p BB is (partially) live now and indicate to the Attributor \p A + /// that internal function called from \p BB should now be looked at. 
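The per-instruction query above has two parts: an instruction in a block that is not assumed live is dead outright, and an instruction in a live block is dead exactly when some earlier instruction in the same block is a liveness barrier (a known dead end or a pending exploration point). A standalone sketch over instruction positions in one block; the names are illustrative.

#include <cstddef>
#include <unordered_set>

// Instructions in one block are numbered 0..N-1 in program order.
// Barriers holds the positions known (or assumed) not to transfer
// control to the next instruction.
static bool instructionAssumedDead(bool BlockAssumedLive, size_t InstPos,
                                   const std::unordered_set<size_t> &Barriers) {
  if (!BlockAssumedLive)
    return true;              // whole block unreachable
  // Scan backwards: anything after a barrier in a live block is dead.
  for (size_t P = InstPos; P-- > 0;)
    if (Barriers.count(P))
      return true;
  return false;               // no barrier before it: assumed live
}

// Example: with a noreturn call at position 2, position 3 is dead while
// positions 0..2 are still treated as live:
//   instructionAssumedDead(true, 3, {2}) == true
//   instructionAssumedDead(true, 2, {2}) == false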
+ bool assumeLive(Attributor &A, const BasicBlock &BB) { + if (!AssumedLiveBlocks.insert(&BB).second) + return false; + + // We assume that all of BB is (probably) live now and if there are calls to + // internal functions we will assume that those are now live as well. This + // is a performance optimization for blocks with calls to a lot of internal + // functions. It can however cause dead functions to be treated as live. + for (const Instruction &I : BB) + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) + if (const Function *F = ICS.getCalledFunction()) + if (F->hasLocalLinkage()) + A.markLiveInternalFunction(*F); + return true; + } + + /// Collection of instructions that need to be explored again, e.g., we + /// did assume they do not transfer control to (one of their) successors. + SmallSetVector<const Instruction *, 8> ToBeExploredFrom; + + /// Collection of instructions that are known to not transfer control. + SmallSetVector<const Instruction *, 8> KnownDeadEnds; + + /// Collection of all assumed live BasicBlocks. + DenseSet<const BasicBlock *> AssumedLiveBlocks; +}; + +static bool +identifyAliveSuccessors(Attributor &A, const CallBase &CB, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + const IRPosition &IPos = IRPosition::callsite_function(CB); + + const auto &NoReturnAA = A.getAAFor<AANoReturn>(AA, IPos); + if (NoReturnAA.isAssumedNoReturn()) + return !NoReturnAA.isKnownNoReturn(); + if (CB.isTerminator()) + AliveSuccessors.push_back(&CB.getSuccessor(0)->front()); + else + AliveSuccessors.push_back(CB.getNextNode()); + return false; +} + +static bool +identifyAliveSuccessors(Attributor &A, const InvokeInst &II, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = + identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors); + + // First, determine if we can change an invoke to a call assuming the + // callee is nounwind. This is not possible if the personality of the + // function allows to catch asynchronous exceptions. 
+ if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } else { + const IRPosition &IPos = IRPosition::callsite_function(II); + const auto &AANoUnw = A.getAAFor<AANoUnwind>(AA, IPos); + if (AANoUnw.isAssumedNoUnwind()) { + UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind(); + } else { + AliveSuccessors.push_back(&II.getUnwindDest()->front()); + } + } + return UsedAssumedInformation; +} + +static Optional<ConstantInt *> +getAssumedConstant(Attributor &A, const Value &V, AbstractAttribute &AA, + bool &UsedAssumedInformation) { + const auto &ValueSimplifyAA = + A.getAAFor<AAValueSimplify>(AA, IRPosition::value(V)); + Optional<Value *> SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(A); + UsedAssumedInformation |= !ValueSimplifyAA.isKnown(); + if (!SimplifiedV.hasValue()) + return llvm::None; + if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) + return llvm::None; + return dyn_cast_or_null<ConstantInt>(SimplifiedV.getValue()); +} + +static bool +identifyAliveSuccessors(Attributor &A, const BranchInst &BI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + if (BI.getNumSuccessors() == 1) { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + } else { + Optional<ConstantInt *> CI = + getAssumedConstant(A, *BI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume both edges are dead. + } else if (CI.getValue()) { + const BasicBlock *SuccBB = + BI.getSuccessor(1 - CI.getValue()->getZExtValue()); + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(&BI.getSuccessor(0)->front()); + AliveSuccessors.push_back(&BI.getSuccessor(1)->front()); + UsedAssumedInformation = false; + } + } + return UsedAssumedInformation; +} + +static bool +identifyAliveSuccessors(Attributor &A, const SwitchInst &SI, + AbstractAttribute &AA, + SmallVectorImpl<const Instruction *> &AliveSuccessors) { + bool UsedAssumedInformation = false; + Optional<ConstantInt *> CI = + getAssumedConstant(A, *SI.getCondition(), AA, UsedAssumedInformation); + if (!CI.hasValue()) { + // No value yet, assume all edges are dead. + } else if (CI.getValue()) { + for (auto &CaseIt : SI.cases()) { + if (CaseIt.getCaseValue() == CI.getValue()) { + AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front()); + return UsedAssumedInformation; } } + AliveSuccessors.push_back(&SI.getDefaultDest()->front()); + return UsedAssumedInformation; + } else { + for (const BasicBlock *SuccBB : successors(SI.getParent())) + AliveSuccessors.push_back(&SuccBB->front()); } - return ChangeStatus::UNCHANGED; + return UsedAssumedInformation; } -/// ------------------------ NonNull Argument Attribute ------------------------ -struct AANonNullImpl : AANonNull, BooleanState { +ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { + ChangeStatus Change = ChangeStatus::UNCHANGED; + + LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/" + << getAssociatedFunction()->size() << "] BBs and " + << ToBeExploredFrom.size() << " exploration points and " + << KnownDeadEnds.size() << " known dead ends\n"); - AANonNullImpl(Value &V, InformationCache &InfoCache) - : AANonNull(V, InfoCache) {} + // Copy and clear the list of instructions we need to explore from. It is + // refilled with instructions the next update has to look at. 
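For a conditional branch, successor 0 is the taken ("true") edge and successor 1 the "false" edge, so once the condition simplifies to a constant C only successor 1 - C stays alive; if no constant is known yet, both edges are optimistically left dead for now, and if the condition is simply not a constant, both stay alive. A minimal sketch of that selection in plain C++ (no LLVM types).

#include <vector>

// State of the branch condition after value simplification.
enum class CondInfo { Unknown, ConstTrue, ConstFalse, NotConstant };

// Indices of the successors that have to be treated as alive.
// Successor 0 is the taken ("true") edge, successor 1 the fall-through
// ("false") edge, matching BranchInst successor numbering.
static std::vector<int> aliveBranchSuccessors(CondInfo Cond) {
  switch (Cond) {
  case CondInfo::Unknown:
    return {};          // no value yet: optimistically assume both dead
  case CondInfo::ConstTrue:
    return {0};         // i1 true  -> successor (1 - 1) == 0
  case CondInfo::ConstFalse:
    return {1};         // i1 false -> successor (1 - 0) == 1
  case CondInfo::NotConstant:
    return {0, 1};      // unknown condition: both edges stay alive
  }
  return {0, 1};
}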
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(), + ToBeExploredFrom.end()); + decltype(ToBeExploredFrom) NewToBeExploredFrom; - AANonNullImpl(Value *AssociatedVal, Value &AnchoredValue, - InformationCache &InfoCache) - : AANonNull(AssociatedVal, AnchoredValue, InfoCache) {} + SmallVector<const Instruction *, 8> AliveSuccessors; + while (!Worklist.empty()) { + const Instruction *I = Worklist.pop_back_val(); + LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n"); + + AliveSuccessors.clear(); + + bool UsedAssumedInformation = false; + switch (I->getOpcode()) { + // TODO: look for (assumed) UB to backwards propagate "deadness". + default: + if (I->isTerminator()) { + for (const BasicBlock *SuccBB : successors(I->getParent())) + AliveSuccessors.push_back(&SuccBB->front()); + } else { + AliveSuccessors.push_back(I->getNextNode()); + } + break; + case Instruction::Call: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Invoke: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Br: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I), + *this, AliveSuccessors); + break; + case Instruction::Switch: + UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I), + *this, AliveSuccessors); + break; + } + + if (UsedAssumedInformation) { + NewToBeExploredFrom.insert(I); + } else { + Change = ChangeStatus::CHANGED; + if (AliveSuccessors.empty() || + (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors())) + KnownDeadEnds.insert(I); + } + + LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: " + << AliveSuccessors.size() << " UsedAssumedInformation: " + << UsedAssumedInformation << "\n"); + + for (const Instruction *AliveSuccessor : AliveSuccessors) { + if (!I->isTerminator()) { + assert(AliveSuccessors.size() == 1 && + "Non-terminator expected to have a single successor!"); + Worklist.push_back(AliveSuccessor); + } else { + if (assumeLive(A, *AliveSuccessor->getParent())) + Worklist.push_back(AliveSuccessor); + } + } + } + + ToBeExploredFrom = std::move(NewToBeExploredFrom); + + // If we know everything is live there is no need to query for liveness. + // Instead, indicating a pessimistic fixpoint will cause the state to be + // "invalid" and all queries to be answered conservatively without lookups. + // To be in this state we have to (1) finished the exploration and (3) not + // discovered any non-trivial dead end and (2) not ruled unreachable code + // dead. + if (ToBeExploredFrom.empty() && + getAssociatedFunction()->size() == AssumedLiveBlocks.size() && + llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) { + return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0; + })) + return indicatePessimisticFixpoint(); + return Change; +} + +/// Liveness information for a call sites. +struct AAIsDeadCallSite final : AAIsDeadFunction { + AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadFunction(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites instead of + // redirecting requests to the callee. 
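The update above is a worklist fixpoint: it pops exploration points, asks each for its alive successors, marks newly reached blocks live, and re-queues any point whose answer relied on assumed rather than known information so it is revisited in the next update. A compact standalone sketch of that shape over a toy block-indexed CFG, with the query callback standing in for identifyAliveSuccessors; all names are illustrative.

#include <functional>
#include <unordered_set>
#include <vector>

struct ExploreResult {
  std::vector<int> AliveSuccs;   // blocks reached from this point
  bool UsedAssumedInfo;          // answer may still change next round
};

// One round of liveness exploration; returns the blocks assumed live
// and refills ToExplore with the points that must be revisited.
static std::unordered_set<int>
exploreLiveBlocks(int Entry, std::unordered_set<int> &ToExplore,
                  const std::function<ExploreResult(int)> &Query) {
  std::unordered_set<int> AssumedLive{Entry};
  std::vector<int> Worklist(ToExplore.begin(), ToExplore.end());
  std::unordered_set<int> NextToExplore;

  while (!Worklist.empty()) {
    int BB = Worklist.back();
    Worklist.pop_back();
    ExploreResult R = Query(BB);
    if (R.UsedAssumedInfo)
      NextToExplore.insert(BB);          // ask again in the next update
    for (int Succ : R.AliveSuccs)
      if (AssumedLive.insert(Succ).second)
        Worklist.push_back(Succ);        // newly live: explore onwards
  }
  ToExplore = std::move(NextToExplore);
  return AssumedLive;
}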
+ llvm_unreachable("Abstract attributes for liveness are not " + "supported for call sites yet!"); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + return indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// -------------------- Dereferenceable Argument Attribute -------------------- + +template <> +ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S, + const DerefState &R) { + ChangeStatus CS0 = + clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState); + ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState); + return CS0 | CS1; +} + +struct AADereferenceableImpl : AADereferenceable { + AADereferenceableImpl(const IRPosition &IRP) : AADereferenceable(IRP) {} + using StateType = DerefState; + + void initialize(Attributor &A) override { + SmallVector<Attribute, 4> Attrs; + getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull}, + Attrs); + for (const Attribute &Attr : Attrs) + takeKnownDerefBytesMaximum(Attr.getValueAsInt()); + + NonNullAA = &A.getAAFor<AANonNull>(*this, getIRPosition()); + + const IRPosition &IRP = this->getIRPosition(); + bool IsFnInterface = IRP.isFnInterfaceKind(); + const Function *FnScope = IRP.getAnchorScope(); + if (IsFnInterface && (!FnScope || !FnScope->hasExactDefinition())) + indicatePessimisticFixpoint(); + } /// See AbstractAttribute::getState() /// { - AbstractState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } + StateType &getState() override { return *this; } + const StateType &getState() const override { return *this; } /// } + /// Helper function for collecting accessed bytes in must-be-executed-context + void addAccessedBytesForUse(Attributor &A, const Use *U, + const Instruction *I) { + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return; + + Type *PtrTy = UseV->getType(); + const DataLayout &DL = A.getDataLayout(); + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Base == &getAssociatedValue() && + Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) { + uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType()); + addAccessedBytes(Offset, Size); + } + } + return; + } + + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool IsNonNull = false; + bool TrackUse = false; + int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse( + A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse); + + addAccessedBytesForUse(A, U, I); + takeKnownDerefBytesMaximum(DerefBytes); + return TrackUse; + } + + /// See AbstractAttribute::manifest(...). 
+ ChangeStatus manifest(Attributor &A) override { + ChangeStatus Change = AADereferenceable::manifest(A); + if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) { + removeAttrs({Attribute::DereferenceableOrNull}); + return ChangeStatus::CHANGED; + } + return Change; + } + + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + // TODO: Add *_globally support + if (isAssumedNonNull()) + Attrs.emplace_back(Attribute::getWithDereferenceableBytes( + Ctx, getAssumedDereferenceableBytes())); + else + Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes( + Ctx, getAssumedDereferenceableBytes())); + } + /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { - return getAssumed() ? "nonnull" : "may-null"; + if (!getAssumedDereferenceableBytes()) + return "unknown-dereferenceable"; + return std::string("dereferenceable") + + (isAssumedNonNull() ? "" : "_or_null") + + (isAssumedGlobal() ? "_globally" : "") + "<" + + std::to_string(getKnownDereferenceableBytes()) + "-" + + std::to_string(getAssumedDereferenceableBytes()) + ">"; } +}; - /// See AANonNull::isAssumedNonNull(). - bool isAssumedNonNull() const override { return getAssumed(); } +/// Dereferenceable attribute for a floating value. +struct AADereferenceableFloating + : AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl> { + using Base = + AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl>; + AADereferenceableFloating(const IRPosition &IRP) : Base(IRP) {} - /// See AANonNull::isKnownNonNull(). - bool isKnownNonNull() const override { return getKnown(); } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Change = Base::updateImpl(A); + + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, DerefState &T, bool Stripped) -> bool { + unsigned IdxWidth = + DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace()); + APInt Offset(IdxWidth, 0); + const Value *Base = + V.stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + const auto &AA = + A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base)); + int64_t DerefBytes = 0; + if (!Stripped && this == &AA) { + // Use IR information if we did not strip anything. + // TODO: track globally. + bool CanBeNull; + DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull); + T.GlobalState.indicatePessimisticFixpoint(); + } else { + const DerefState &DS = static_cast<const DerefState &>(AA.getState()); + DerefBytes = DS.DerefBytesState.getAssumed(); + T.GlobalState &= DS.GlobalState; + } + + // TODO: Use `AAConstantRange` to infer dereferenceable bytes. + + // For now we do not try to "increase" dereferenceability due to negative + // indices as we first have to come up with code to deal with loops and + // for overflows of the dereferenceable bytes. + int64_t OffsetSExt = Offset.getSExtValue(); + if (OffsetSExt < 0) + OffsetSExt = 0; + + T.takeAssumedDerefBytesMinimum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + + if (this == &AA) { + if (!Stripped) { + // If nothing was stripped IR information is all we got. + T.takeKnownDerefBytesMaximum( + std::max(int64_t(0), DerefBytes - OffsetSExt)); + T.indicatePessimisticFixpoint(); + } else if (OffsetSExt > 0) { + // If something was stripped but there is circular reasoning we look + // for the offset. 
If it is positive we basically decrease the + // dereferenceable bytes in a circluar loop now, which will simply + // drive them down to the known value in a very slow way which we + // can accelerate. + T.indicatePessimisticFixpoint(); + } + } + + return T.isValidState(); + }; - /// Generate a predicate that checks if a given value is assumed nonnull. - /// The generated function returns true if a value satisfies any of - /// following conditions. - /// (i) A value is known nonZero(=nonnull). - /// (ii) A value is associated with AANonNull and its isAssumedNonNull() is - /// true. - std::function<bool(Value &)> generatePredicate(Attributor &); + DerefState T; + if (!genericValueTraversal<AADereferenceable, DerefState>( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return Change | clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(dereferenceable) + } }; -std::function<bool(Value &)> AANonNullImpl::generatePredicate(Attributor &A) { - // FIXME: The `AAReturnedValues` should provide the predicate with the - // `ReturnInst` vector as well such that we can use the control flow sensitive - // version of `isKnownNonZero`. This should fix `test11` in - // `test/Transforms/FunctionAttrs/nonnull.ll` +/// Dereferenceable attribute for a return value. +struct AADereferenceableReturned final + : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl, + DerefState> { + AADereferenceableReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl, + DerefState>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(dereferenceable) + } +}; - std::function<bool(Value &)> Pred = [&](Value &RV) -> bool { - if (isKnownNonZero(&RV, getAnchorScope().getParent()->getDataLayout())) - return true; +/// Dereferenceable attribute for an argument +struct AADereferenceableArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState> { + using Base = AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl, DerefState>; + AADereferenceableArgument(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(dereferenceable) + } +}; - auto *NonNullAA = A.getAAFor<AANonNull>(*this, RV); +/// Dereferenceable attribute for a call site argument. +struct AADereferenceableCallSiteArgument final : AADereferenceableFloating { + AADereferenceableCallSiteArgument(const IRPosition &IRP) + : AADereferenceableFloating(IRP) {} - ImmutableCallSite ICS(&RV); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(dereferenceable) + } +}; - if ((!NonNullAA || !NonNullAA->isAssumedNonNull()) && - (!ICS || !ICS.hasRetAttr(Attribute::NonNull))) - return false; +/// Dereferenceable attribute deduction for a call site return value. 
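The traversal above strips a constant in-bounds offset down to a base pointer, reads how many bytes the base is dereferenceable, and credits the derived pointer with the remainder, clamping both a negative offset and a negative result to zero. A tiny sketch of just that arithmetic (not the LLVM API).

#include <algorithm>
#include <cstdint>

// Dereferenceable bytes that can be claimed for `base + Offset` when
// `base` is known dereferenceable for BaseDerefBytes bytes.
static int64_t derefBytesAtOffset(int64_t BaseDerefBytes, int64_t Offset) {
  // Negative offsets are not used to "grow" dereferenceability; they
  // are conservatively treated like offset 0.
  if (Offset < 0)
    Offset = 0;
  return std::max<int64_t>(0, BaseDerefBytes - Offset);
}

// Example: a base dereferenceable for 16 bytes accessed at +4 yields
// dereferenceable(12); at +32 nothing can be claimed:
//   derefBytesAtOffset(16, 4)  == 12
//   derefBytesAtOffset(16, 32) == 0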
+struct AADereferenceableCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl> { + using Base = AACallSiteReturnedFromReturnedAndMustBeExecutedContext< + AADereferenceable, AADereferenceableImpl>; + AADereferenceableCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(dereferenceable); + } +}; - return true; - }; +// ------------------------ Align Argument Attribute ------------------------ + +static unsigned int getKnownAlignForUse(Attributor &A, + AbstractAttribute &QueryingAA, + Value &AssociatedValue, const Use *U, + const Instruction *I, bool &TrackUse) { + // We need to follow common pointer manipulation uses to the accesses they + // feed into. + if (isa<CastInst>(I)) { + // Follow all but ptr2int casts. + TrackUse = !isa<PtrToIntInst>(I); + return 0; + } + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + } + + unsigned Alignment = 0; + if (ImmutableCallSite ICS = ImmutableCallSite(I)) { + if (ICS.isBundleOperand(U) || ICS.isCallee(U)) + return 0; + + unsigned ArgNo = ICS.getArgumentNo(U); + IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo); + // As long as we only use known information there is no need to track + // dependences here. + auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP, + /* TrackDependence */ false); + Alignment = AlignAA.getKnownAlign(); + } + + const Value *UseV = U->get(); + if (auto *SI = dyn_cast<StoreInst>(I)) + Alignment = SI->getAlignment(); + else if (auto *LI = dyn_cast<LoadInst>(I)) + Alignment = LI->getAlignment(); - return Pred; + if (Alignment <= 1) + return 0; + + auto &DL = A.getDataLayout(); + int64_t Offset; + + if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) { + if (Base == &AssociatedValue) { + // BasePointerAddr + Offset = Alignment * Q for some integer Q. + // So we can say that the maximum power of two which is a divisor of + // gcd(Offset, Alignment) is an alignment. + + uint32_t gcd = + greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment); + Alignment = llvm::PowerOf2Floor(gcd); + } + } + + return Alignment; } +struct AAAlignImpl : AAAlign { + AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {} -/// NonNull attribute for function return value. -struct AANonNullReturned : AANonNullImpl { + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + SmallVector<Attribute, 4> Attrs; + getAttrs({Attribute::Alignment}, Attrs); + for (const Attribute &Attr : Attrs) + takeKnownMaximum(Attr.getValueAsInt()); + + if (getIRPosition().isFnInterfaceKind() && + (!getAssociatedFunction() || + !getAssociatedFunction()->hasExactDefinition())) + indicatePessimisticFixpoint(); + } - AANonNullReturned(Function &F, InformationCache &InfoCache) - : AANonNullImpl(F, InfoCache) {} + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + // Check for users that allow alignment annotations. 
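getKnownAlignForUse above reasons as follows: if the access at base + Offset is known to be Align-aligned, then base differs from a multiple of Align by exactly Offset, so base is at least aligned to the largest power of two dividing gcd(|Offset|, Align). A standalone sketch of exactly that computation, with local stand-ins for greatestCommonDivisor and PowerOf2Floor.

#include <cstdint>

static uint32_t gcdU32(uint32_t A, uint32_t B) {
  while (B != 0) {
    uint32_t T = A % B;
    A = B;
    B = T;
  }
  return A;
}

static uint32_t powerOf2Floor(uint32_t X) {
  uint32_t P = 0;
  for (uint32_t C = 1; C != 0 && C <= X; C <<= 1)
    P = C;                       // largest power of two <= X
  return P;
}

// Alignment that can be claimed for the base pointer when the access
// `base + Offset` is known to be Align-aligned.
static uint32_t alignFromUse(int64_t Offset, uint32_t Align) {
  uint32_t AbsOffset = uint32_t(Offset < 0 ? -Offset : Offset);
  return powerOf2Floor(gcdU32(AbsOffset, Align));
}

// Example: a 16-byte aligned store to base+4 only proves the base is
// 4-byte aligned, while a store to base+0 keeps the full 16:
//   alignFromUse(4, 16) == 4
//   alignFromUse(0, 16) == 16   (gcd(0, 16) == 16)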
+ Value &AnchorVal = getIRPosition().getAnchorValue(); + for (const Use &U : AnchorVal.uses()) { + if (auto *SI = dyn_cast<StoreInst>(U.getUser())) { + if (SI->getPointerOperand() == &AnchorVal) + if (SI->getAlignment() < getAssumedAlign()) { + STATS_DECLTRACK(AAAlign, Store, + "Number of times alignment added to a store"); + SI->setAlignment(Align(getAssumedAlign())); + Changed = ChangeStatus::CHANGED; + } + } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) { + if (LI->getPointerOperand() == &AnchorVal) + if (LI->getAlignment() < getAssumedAlign()) { + LI->setAlignment(Align(getAssumedAlign())); + STATS_DECLTRACK(AAAlign, Load, + "Number of times alignment added to a load"); + Changed = ChangeStatus::CHANGED; + } + } + } + + return AAAlign::manifest(A) | Changed; + } + + // TODO: Provide a helper to determine the implied ABI alignment and check in + // the existing manifest method and a new one for AAAlignImpl that value + // to avoid making the alignment explicit if it did not improve. + + /// See AbstractAttribute::getDeducedAttributes + virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + if (getAssumedAlign() > 1) + Attrs.emplace_back( + Attribute::getWithAlignment(Ctx, Align(getAssumedAlign()))); + } + /// See AAFromMustBeExecutedContext + bool followUse(Attributor &A, const Use *U, const Instruction *I) { + bool TrackUse = false; + + unsigned int KnownAlign = + getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse); + takeKnownMaximum(KnownAlign); + + return TrackUse; + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) + + "-" + std::to_string(getAssumedAlign()) + ">") + : "unknown-align"; + } +}; + +/// Align attribute for a floating value. +struct AAAlignFloating : AAFromMustBeExecutedContext<AAAlign, AAAlignImpl> { + using Base = AAFromMustBeExecutedContext<AAAlign, AAAlignImpl>; + AAAlignFloating(const IRPosition &IRP) : Base(IRP) {} - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_RETURNED; } + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + Base::updateImpl(A); + + const DataLayout &DL = A.getDataLayout(); + + auto VisitValueCB = [&](Value &V, AAAlign::StateType &T, + bool Stripped) -> bool { + const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // Use only IR information if we did not strip anything. + const MaybeAlign PA = V.getPointerAlignment(DL); + T.takeKnownMaximum(PA ? PA->value() : 0); + T.indicatePessimisticFixpoint(); + } else { + // Use abstract attribute information. + const AAAlign::StateType &DS = + static_cast<const AAAlign::StateType &>(AA.getState()); + T ^= DS; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T, + VisitValueCB)) + return indicatePessimisticFixpoint(); + + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) } +}; - /// See AbstractAttriubute::initialize(...). +/// Align attribute for function return value. 
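When the floating align attribute combines several underlying values (for example the incoming values of a PHI) it can only keep a guarantee that holds for every one of them, so the effective meet on the assumed alignment is a minimum. A sketch of that effective combination, assuming plain integers instead of the pass's IntegerState machinery (which additionally tracks known bounds).

#include <algorithm>
#include <cstdint>
#include <vector>

// Alignment that may be announced for a value that can be any one of
// several underlying pointers: only what holds for every incoming
// value survives the meet.
static uint64_t combineIncomingAlignments(const std::vector<uint64_t> &Aligns) {
  if (Aligns.empty())
    return 1;                      // nothing to learn from
  uint64_t Result = UINT64_MAX;    // optimistic "best" state
  for (uint64_t A : Aligns)
    Result = std::min(Result, A == 0 ? 1 : A);   // unknown counts as align 1
  return Result;
}

// Example: a PHI of a 16-byte aligned alloca and an 8-byte aligned
// argument can only be announced as align 8:
//   combineIncomingAlignments({16, 8}) == 8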
+struct AAAlignReturned final + : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> { + AAAlignReturned(const IRPosition &IRP) + : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) } +}; + +/// Align attribute for function argument. +struct AAAlignArgument final + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { + AAAlignArgument(const IRPosition &IRP) + : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign, + AAAlignImpl>( + IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) } +}; + +struct AAAlignCallSiteArgument final : AAAlignFloating { + AAAlignCallSiteArgument(const IRPosition &IRP) : AAAlignFloating(IRP) {} + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + return AAAlignImpl::manifest(A); + } + + /// See AbstractAttribute::updateImpl(Attributor &A). + ChangeStatus updateImpl(Attributor &A) override { + ChangeStatus Changed = AAAlignFloating::updateImpl(A); + if (Argument *Arg = getAssociatedArgument()) { + const auto &ArgAlignAA = A.getAAFor<AAAlign>( + *this, IRPosition::argument(*Arg), /* TrackDependence */ false, + DepClassTy::OPTIONAL); + takeKnownMaximum(ArgAlignAA.getKnownAlign()); + } + return Changed; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) } +}; + +/// Align attribute deduction for a call site return value. +struct AAAlignCallSiteReturned final + : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl> { + using Base = + AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign, + AAAlignImpl>; + AAAlignCallSiteReturned(const IRPosition &IRP) : Base(IRP) {} + + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Function &F = getAnchorScope(); + Base::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } +}; - // Already nonnull. - if (F.getAttributes().hasAttribute(AttributeList::ReturnIndex, - Attribute::NonNull)) +/// ------------------ Function No-Return Attribute ---------------------------- +struct AANoReturnImpl : public AANoReturn { + AANoReturnImpl(const IRPosition &IRP) : AANoReturn(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoReturn::initialize(A); + Function *F = getAssociatedFunction(); + if (!F) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? "noreturn" : "may-return"; + } + + /// See AbstractAttribute::updateImpl(Attributor &A). 
+ virtual ChangeStatus updateImpl(Attributor &A) override { + auto CheckForNoReturn = [](Instruction &) { return false; }; + if (!A.checkForAllInstructions(CheckForNoReturn, *this, + {(unsigned)Instruction::Ret})) + return indicatePessimisticFixpoint(); + return ChangeStatus::UNCHANGED; + } +}; + +struct AANoReturnFunction final : AANoReturnImpl { + AANoReturnFunction(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) } +}; + +/// NoReturn attribute deduction for a call sites. +struct AANoReturnCallSite final : AANoReturnImpl { + AANoReturnCallSite(const IRPosition &IRP) : AANoReturnImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoReturn::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); } +}; + +/// ----------------------- Variable Capturing --------------------------------- + +/// A class to hold the state of for no-capture attributes. +struct AANoCaptureImpl : public AANoCapture { + AANoCaptureImpl(const IRPosition &IRP) : AANoCapture(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) { indicateOptimisticFixpoint(); + return; + } + Function *AnchorScope = getAnchorScope(); + if (isFnInterfaceKind() && + (!AnchorScope || !AnchorScope->hasExactDefinition())) { + indicatePessimisticFixpoint(); + return; + } + + // You cannot "capture" null in the default address space. + if (isa<ConstantPointerNull>(getAssociatedValue()) && + getAssociatedValue().getType()->getPointerAddressSpace() == 0) { + indicateOptimisticFixpoint(); + return; + } + + const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope; + + // Check what state the associated function can actually capture. + if (F) + determineFunctionCaptureCapabilities(getIRPosition(), *F, *this); + else + indicatePessimisticFixpoint(); } /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override; + + /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...). + virtual void + getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + if (!isAssumedNoCaptureMaybeReturned()) + return; + + if (getArgNo() >= 0) { + if (isAssumedNoCapture()) + Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture)); + else if (ManifestInternal) + Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned")); + } + } + + /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p Known + /// depending on the ability of the function associated with \p IRP to capture + /// state in memory and through "returning/throwing", respectively. 
+ static void determineFunctionCaptureCapabilities(const IRPosition &IRP, + const Function &F, + BitIntegerState &State) { + // TODO: Once we have memory behavior attributes we should use them here. + + // If we know we cannot communicate or write to memory, we do not care about + // ptr2int anymore. + if (F.onlyReadsMemory() && F.doesNotThrow() && + F.getReturnType()->isVoidTy()) { + State.addKnownBits(NO_CAPTURE); + return; + } + + // A function cannot capture state in memory if it only reads memory, it can + // however return/throw state and the state might be influenced by the + // pointer value, e.g., loading from a returned pointer might reveal a bit. + if (F.onlyReadsMemory()) + State.addKnownBits(NOT_CAPTURED_IN_MEM); + + // A function cannot communicate state back if it does not through + // exceptions and doesn not return values. + if (F.doesNotThrow() && F.getReturnType()->isVoidTy()) + State.addKnownBits(NOT_CAPTURED_IN_RET); + + // Check existing "returned" attributes. + int ArgNo = IRP.getArgNo(); + if (F.doesNotThrow() && ArgNo >= 0) { + for (unsigned u = 0, e = F.arg_size(); u < e; ++u) + if (F.hasParamAttribute(u, Attribute::Returned)) { + if (u == unsigned(ArgNo)) + State.removeAssumedBits(NOT_CAPTURED_IN_RET); + else if (F.onlyReadsMemory()) + State.addKnownBits(NO_CAPTURE); + else + State.addKnownBits(NOT_CAPTURED_IN_RET); + break; + } + } + } + + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isKnownNoCapture()) + return "known not-captured"; + if (isAssumedNoCapture()) + return "assumed not-captured"; + if (isKnownNoCaptureMaybeReturned()) + return "known not-captured-maybe-returned"; + if (isAssumedNoCaptureMaybeReturned()) + return "assumed not-captured-maybe-returned"; + return "assumed-captured"; + } }; -ChangeStatus AANonNullReturned::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +/// Attributor-aware capture tracker. +struct AACaptureUseTracker final : public CaptureTracker { - auto *AARetVal = A.getAAFor<AAReturnedValues>(*this, F); - if (!AARetVal) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// Create a capture tracker that can lookup in-flight abstract attributes + /// through the Attributor \p A. + /// + /// If a use leads to a potential capture, \p CapturedInMemory is set and the + /// search is stopped. If a use leads to a return instruction, + /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed. + /// If a use leads to a ptr2int which may capture the value, + /// \p CapturedInInteger is set. If a use is found that is currently assumed + /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies + /// set. All values in \p PotentialCopies are later tracked as well. For every + /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0, + /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger + /// conservatively set to true. + AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA, + const AAIsDead &IsDeadAA, AANoCapture::StateType &State, + SmallVectorImpl<const Value *> &PotentialCopies, + unsigned &RemainingUsesToExplore) + : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State), + PotentialCopies(PotentialCopies), + RemainingUsesToExplore(RemainingUsesToExplore) {} + + /// Determine if \p V maybe captured. 
*Also updates the state!* + bool valueMayBeCaptured(const Value *V) { + if (V->getType()->isPointerTy()) { + PointerMayBeCaptured(V, this); + } else { + State.indicatePessimisticFixpoint(); + } + return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); } - std::function<bool(Value &)> Pred = this->generatePredicate(A); - if (!AARetVal->checkForallReturnedValues(Pred)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See CaptureTracker::tooManyUses(). + void tooManyUses() override { + State.removeAssumedBits(AANoCapture::NO_CAPTURE); } - return ChangeStatus::UNCHANGED; + + bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override { + if (CaptureTracker::isDereferenceableOrNull(O, DL)) + return true; + const auto &DerefAA = + A.getAAFor<AADereferenceable>(NoCaptureAA, IRPosition::value(*O)); + return DerefAA.getAssumedDereferenceableBytes(); + } + + /// See CaptureTracker::captured(...). + bool captured(const Use *U) override { + Instruction *UInst = cast<Instruction>(U->getUser()); + LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst + << "\n"); + + // Because we may reuse the tracker multiple times we keep track of the + // number of explored uses ourselves as well. + if (RemainingUsesToExplore-- == 0) { + LLVM_DEBUG(dbgs() << " - too many uses to explore!\n"); + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + // Deal with ptr2int by following uses. + if (isa<PtrToIntInst>(UInst)) { + LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n"); + return valueMayBeCaptured(UInst); + } + + // Explicitly catch return instructions. + if (isa<ReturnInst>(UInst)) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ true); + + // For now we only use special logic for call sites. However, the tracker + // itself knows about a lot of other non-capturing cases already. + CallSite CS(UInst); + if (!CS || !CS.isArgOperand(U)) + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + + unsigned ArgNo = CS.getArgumentNo(U); + const IRPosition &CSArgPos = IRPosition::callsite_argument(CS, ArgNo); + // If we have a abstract no-capture attribute for the argument we can use + // it to justify a non-capture attribute here. This allows recursion! + auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos); + if (ArgNoCaptureAA.isAssumedNoCapture()) + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + addPotentialCopy(CS); + return isCapturedIn(/* Memory */ false, /* Integer */ false, + /* Return */ false); + } + + // Lastly, we could not find a reason no-capture can be assumed so we don't. + return isCapturedIn(/* Memory */ true, /* Integer */ true, + /* Return */ true); + } + + /// Register \p CS as potential copy of the value we are checking. + void addPotentialCopy(CallSite CS) { + PotentialCopies.push_back(CS.getInstruction()); + } + + /// See CaptureTracker::shouldExplore(...). + bool shouldExplore(const Use *U) override { + // Check liveness. + return !IsDeadAA.isAssumedDead(cast<Instruction>(U->getUser())); + } + + /// Update the state according to \p CapturedInMem, \p CapturedInInt, and + /// \p CapturedInRet, then return the appropriate value for use in the + /// CaptureTracker::captured() interface. 
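captured() above classifies each use with a shrinking budget: once RemainingUsesToExplore hits zero everything is treated as captured, a return only removes the "not captured in return" bit, a call operand whose callee argument is nocapture (or nocapture-maybe-returned, in which case the call result is tracked as a potential copy) costs nothing, and every use the tracker does not understand captures in all three ways. A simplified standalone sketch of that classification; the real code also stops as soon as even the weaker maybe-returned state is gone, and the names below are illustrative.

// Per-value capture facts tracked while walking uses; all three start
// out assumed "not captured" and bits are knocked out per use.
struct CaptureState {
  bool InMem = false, InInt = false, InRet = false;
};

enum class UseKind {
  Return,                // value flows into a return instruction
  CallArgNoCapture,      // call operand, callee argument is nocapture
  CallArgMaybeReturned,  // nocapture-maybe-returned: track the call result
  Worst,                 // anything the tracker does not understand
};

// Process one use; Budget stands in for RemainingUsesToExplore.
// Returns true when the walk may stop because nothing of the
// no-capture assumption is left anyway.
static bool processUse(UseKind Kind, unsigned &Budget, CaptureState &S) {
  if (Budget == 0) {                        // too many uses: give up
    S.InMem = S.InInt = S.InRet = true;
    return true;
  }
  --Budget;
  switch (Kind) {
  case UseKind::Return:
    S.InRet = true;                         // escapes only via the return
    return false;                           // keep looking at other uses
  case UseKind::CallArgNoCapture:           // callee promises not to capture
  case UseKind::CallArgMaybeReturned:       // caller also walks the call result
    return false;
  case UseKind::Worst:
    S.InMem = S.InInt = S.InRet = true;     // assume every kind of escape
    return true;
  }
  return true;
}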
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt, + bool CapturedInRet) { + LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int " + << CapturedInInt << "|Ret " << CapturedInRet << "]\n"); + if (CapturedInMem) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM); + if (CapturedInInt) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT); + if (CapturedInRet) + State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET); + return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED); + } + +private: + /// The attributor providing in-flight abstract attributes. + Attributor &A; + + /// The abstract attribute currently updated. + AANoCapture &NoCaptureAA; + + /// The abstract liveness state. + const AAIsDead &IsDeadAA; + + /// The state currently updated. + AANoCapture::StateType &State; + + /// Set of potential copies of the tracked value. + SmallVectorImpl<const Value *> &PotentialCopies; + + /// Global counter to limit the number of explored uses. + unsigned &RemainingUsesToExplore; +}; + +ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { + const IRPosition &IRP = getIRPosition(); + const Value *V = + getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue(); + if (!V) + return indicatePessimisticFixpoint(); + + const Function *F = + getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope(); + assert(F && "Expected a function!"); + const IRPosition &FnPos = IRPosition::function(*F); + const auto &IsDeadAA = A.getAAFor<AAIsDead>(*this, FnPos); + + AANoCapture::StateType T; + + // Readonly means we cannot capture through memory. + const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + if (FnMemAA.isAssumedReadOnly()) { + T.addKnownBits(NOT_CAPTURED_IN_MEM); + if (FnMemAA.isKnownReadOnly()) + addKnownBits(NOT_CAPTURED_IN_MEM); + } + + // Make sure all returned values are different than the underlying value. + // TODO: we could do this in a more sophisticated way inside + // AAReturnedValues, e.g., track all values that escape through returns + // directly somehow. + auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) { + bool SeenConstant = false; + for (auto &It : RVAA.returned_values()) { + if (isa<Constant>(It.first)) { + if (SeenConstant) + return false; + SeenConstant = true; + } else if (!isa<Argument>(It.first) || + It.first == getAssociatedArgument()) + return false; + } + return true; + }; + + const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(*this, FnPos); + if (NoUnwindAA.isAssumedNoUnwind()) { + bool IsVoidTy = F->getReturnType()->isVoidTy(); + const AAReturnedValues *RVAA = + IsVoidTy ? nullptr : &A.getAAFor<AAReturnedValues>(*this, FnPos); + if (IsVoidTy || CheckReturnedArgs(*RVAA)) { + T.addKnownBits(NOT_CAPTURED_IN_RET); + if (T.isKnown(NOT_CAPTURED_IN_MEM)) + return ChangeStatus::UNCHANGED; + if (NoUnwindAA.isKnownNoUnwind() && + (IsVoidTy || RVAA->getState().isAtFixpoint())) { + addKnownBits(NOT_CAPTURED_IN_RET); + if (isKnown(NOT_CAPTURED_IN_MEM)) + return indicateOptimisticFixpoint(); + } + } + } + + // Use the CaptureTracker interface and logic with the specialized tracker, + // defined in AACaptureUseTracker, that can look at in-flight abstract + // attributes and directly updates the assumed state. 
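The tracker above operates on a known/assumed bit lattice: known bits only ever grow, assumed bits only ever shrink, and NO_CAPTURE is the conjunction of the three NOT_CAPTURED_IN_* bits. The following standalone sketch (plain C++, no LLVM headers; the CaptureState type and main() are simplified stand-ins, not the real Attributor interfaces) models how isCapturedIn() clears assumed bits and when the use walk can stop early.

#include <cstdint>
#include <cstdio>

// Simplified model of a BitIntegerState: "known" bits only ever grow,
// "assumed" bits only ever shrink, and known implies assumed.
struct CaptureState {
  static constexpr uint32_t NOT_CAPTURED_IN_MEM = 1u << 0;
  static constexpr uint32_t NOT_CAPTURED_IN_INT = 1u << 1;
  static constexpr uint32_t NOT_CAPTURED_IN_RET = 1u << 2;
  static constexpr uint32_t NO_CAPTURE =
      NOT_CAPTURED_IN_MEM | NOT_CAPTURED_IN_INT | NOT_CAPTURED_IN_RET;

  uint32_t Known = 0;            // proven facts, monotonically increasing
  uint32_t Assumed = NO_CAPTURE; // optimistic start, monotonically decreasing

  void addKnownBits(uint32_t Bits) { Known |= Bits; Assumed |= Bits; }
  void removeAssumedBits(uint32_t Bits) { Assumed &= ~Bits; }
  bool isAssumed(uint32_t Bits) const { return (Assumed & Bits) == Bits; }
};

// Mirrors the effect of isCapturedIn(): each kind of capture clears the
// corresponding "not captured" assumption. The return value models aborting
// the walk once even "no-capture-maybe-returned" (taken here to be the
// memory and integer bits) can no longer be assumed.
bool markCapturedIn(CaptureState &S, bool InMem, bool InInt, bool InRet) {
  if (InMem) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_MEM);
  if (InInt) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_INT);
  if (InRet) S.removeAssumedBits(CaptureState::NOT_CAPTURED_IN_RET);
  return !S.isAssumed(CaptureState::NOT_CAPTURED_IN_MEM |
                      CaptureState::NOT_CAPTURED_IN_INT);
}

int main() {
  CaptureState S;
  markCapturedIn(S, /*InMem=*/false, /*InInt=*/false, /*InRet=*/true);
  std::printf("no-capture assumed: %d\n",
              S.isAssumed(CaptureState::NO_CAPTURE));          // 0
  std::printf("not-captured-in-memory assumed: %d\n",
              S.isAssumed(CaptureState::NOT_CAPTURED_IN_MEM)); // 1
}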
+ SmallVector<const Value *, 4> PotentialCopies; + unsigned RemainingUsesToExplore = DefaultMaxUsesToExplore; + AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies, + RemainingUsesToExplore); + + // Check all potential copies of the associated value until we can assume + // none will be captured or we have to assume at least one might be. + unsigned Idx = 0; + PotentialCopies.push_back(V); + while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size()) + Tracker.valueMayBeCaptured(PotentialCopies[Idx++]); + + AANoCapture::StateType &S = getState(); + auto Assumed = S.getAssumed(); + S.intersectAssumedBits(T.getAssumed()); + if (!isAssumedNoCaptureMaybeReturned()) + return indicatePessimisticFixpoint(); + return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED + : ChangeStatus::CHANGED; } -/// NonNull attribute for function argument. -struct AANonNullArgument : AANonNullImpl { +/// NoCapture attribute for function arguments. +struct AANoCaptureArgument final : AANoCaptureImpl { + AANoCaptureArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - AANonNullArgument(Argument &A, InformationCache &InfoCache) - : AANonNullImpl(A, InfoCache) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) } +}; - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { return MP_ARGUMENT; } +/// NoCapture attribute for call site arguments. +struct AANoCaptureCallSiteArgument final : AANoCaptureImpl { + AANoCaptureCallSiteArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - /// See AbstractAttriubute::initialize(...). + /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - Argument *Arg = cast<Argument>(getAssociatedValue()); - if (Arg->hasNonNullAttr()) - indicateOptimisticFixpoint(); + if (Argument *Arg = getAssociatedArgument()) + if (Arg->hasByValAttr()) + indicateOptimisticFixpoint(); + AANoCaptureImpl::initialize(A); } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + if (!Arg) + return indicatePessimisticFixpoint(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AANoCapture::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)}; }; -/// NonNull attribute for a call site argument. -struct AANonNullCallSiteArgument : AANonNullImpl { +/// NoCapture attribute for floating values. +struct AANoCaptureFloating final : AANoCaptureImpl { + AANoCaptureFloating(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} - /// See AANonNullImpl::AANonNullImpl(...). 
- AANonNullCallSiteArgument(CallSite CS, unsigned ArgNo, - InformationCache &InfoCache) - : AANonNullImpl(CS.getArgOperand(ArgNo), *CS.getInstruction(), InfoCache), - ArgNo(ArgNo) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(nocapture) + } +}; + +/// NoCapture attribute for function return value. +struct AANoCaptureReturned final : AANoCaptureImpl { + AANoCaptureReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - CallSite CS(&getAnchoredValue()); - if (isKnownNonZero(getAssociatedValue(), - getAnchorScope().getParent()->getDataLayout()) || - CS.paramHasAttr(ArgNo, getAttrKind())) - indicateOptimisticFixpoint(); + llvm_unreachable("NoCapture is not applicable to function returns!"); } - /// See AbstractAttribute::updateImpl(Attributor &A). - ChangeStatus updateImpl(Attributor &A) override; + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("NoCapture is not applicable to function returns!"); + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_CALL_SITE_ARGUMENT; - }; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; - // Return argument index of associated value. - int getArgNo() const { return ArgNo; } +/// NoCapture attribute deduction for a call site return value. +struct AANoCaptureCallSiteReturned final : AANoCaptureImpl { + AANoCaptureCallSiteReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) {} -private: - unsigned ArgNo; + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(nocapture) + } }; -ChangeStatus AANonNullArgument::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); - Argument &Arg = cast<Argument>(getAnchoredValue()); - unsigned ArgNo = Arg.getArgNo(); +/// ------------------ Value Simplify Attribute ---------------------------- +struct AAValueSimplifyImpl : AAValueSimplify { + AAValueSimplifyImpl(const IRPosition &IRP) : AAValueSimplify(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple") + : "not-simple"; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} + + /// See AAValueSimplify::getAssumedSimplifiedValue() + Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override { + if (!getAssumed()) + return const_cast<Value *>(&getAssociatedValue()); + return SimplifiedAssociatedValue; + } + void initialize(Attributor &A) override {} + + /// Helper function for querying AAValueSimplify and updating candicate. + /// \param QueryingValue Value trying to unify with SimplifiedValue + /// \param AccumulatedSimplifiedValue Current simplification result. + static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA, + Value &QueryingValue, + Optional<Value *> &AccumulatedSimplifiedValue) { + // FIXME: Add a typecast support. 
- // Callback function - std::function<bool(CallSite)> CallSiteCheck = [&](CallSite CS) { - assert(CS && "Sanity check: Call site was not initialized properly!"); + auto &ValueSimpifyAA = A.getAAFor<AAValueSimplify>( + QueryingAA, IRPosition::value(QueryingValue)); - auto *NonNullAA = A.getAAFor<AANonNull>(*this, *CS.getInstruction(), ArgNo); + Optional<Value *> QueryingValueSimplified = + ValueSimpifyAA.getAssumedSimplifiedValue(A); - // Check that NonNullAA is AANonNullCallSiteArgument. - if (NonNullAA) { - ImmutableCallSite ICS(&NonNullAA->getAnchoredValue()); - if (ICS && CS.getInstruction() == ICS.getInstruction()) - return NonNullAA->isAssumedNonNull(); + if (!QueryingValueSimplified.hasValue()) + return true; + + if (!QueryingValueSimplified.getValue()) return false; + + Value &QueryingValueSimplifiedUnwrapped = + *QueryingValueSimplified.getValue(); + + if (isa<UndefValue>(QueryingValueSimplifiedUnwrapped)) + return true; + + if (AccumulatedSimplifiedValue.hasValue()) + return AccumulatedSimplifiedValue == QueryingValueSimplified; + + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << QueryingValue + << " is assumed to be " + << QueryingValueSimplifiedUnwrapped << "\n"); + + AccumulatedSimplifiedValue = QueryingValueSimplified; + return true; + } + + bool askSimplifiedValueForAAValueConstantRange(Attributor &A) { + if (!getAssociatedValue().getType()->isIntegerTy()) + return false; + + const auto &ValueConstantRangeAA = + A.getAAFor<AAValueConstantRange>(*this, getIRPosition()); + + Optional<ConstantInt *> COpt = + ValueConstantRangeAA.getAssumedConstantInt(A); + if (COpt.hasValue()) { + if (auto *C = COpt.getValue()) + SimplifiedAssociatedValue = C; + else + return false; + } else { + // FIXME: It should be llvm::None but if you set llvm::None, + // values are mistakenly infered as `undef` now. + SimplifiedAssociatedValue = &getAssociatedValue(); + } + return true; + } + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + if (!SimplifiedAssociatedValue.hasValue() || + !SimplifiedAssociatedValue.getValue()) + return Changed; + + if (auto *C = dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())) { + // We can replace the AssociatedValue with the constant. + Value &V = getAssociatedValue(); + if (!V.user_empty() && &V != C && V.getType() == C->getType()) { + LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << V << " -> " << *C + << "\n"); + A.changeValueAfterManifest(V, *C); + Changed = ChangeStatus::CHANGED; + } + } + + return Changed | AAValueSimplify::manifest(A); + } + + /// See AbstractState::indicatePessimisticFixpoint(...). + ChangeStatus indicatePessimisticFixpoint() override { + // NOTE: Associated value will be returned in a pessimistic fixpoint and is + // regarded as known. That's why`indicateOptimisticFixpoint` is called. + SimplifiedAssociatedValue = &getAssociatedValue(); + indicateOptimisticFixpoint(); + return ChangeStatus::CHANGED; + } + +protected: + // An assumed simplified value. Initially, it is set to Optional::None, which + // means that the value is not clear under current assumption. If in the + // pessimistic state, getAssumedSimplifiedValue doesn't return this value but + // returns orignal associated value. 
+ Optional<Value *> SimplifiedAssociatedValue; +}; + +struct AAValueSimplifyArgument final : AAValueSimplifyImpl { + AAValueSimplifyArgument(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + void initialize(Attributor &A) override { + AAValueSimplifyImpl::initialize(A); + if (!getAssociatedFunction() || getAssociatedFunction()->isDeclaration()) + indicatePessimisticFixpoint(); + if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest}, + /* IgnoreSubsumingPositions */ true)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // Byval is only replacable if it is readonly otherwise we would write into + // the replaced value and not the copy that byval creates implicitly. + Argument *Arg = getAssociatedArgument(); + if (Arg->hasByValAttr()) { + const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition()); + if (!MemAA.isAssumedReadOnly()) + return indicatePessimisticFixpoint(); + } + + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForCallSite = [&](AbstractCallSite ACS) { + // Check if we have an associated argument or not (which can happen for + // callback calls). + Value *ArgOp = ACS.getCallArgOperand(getArgNo()); + if (!ArgOp) + return false; + // We can only propagate thread independent values through callbacks. + // This is different to direct/indirect call sites because for them we + // know the thread executing the caller and callee is the same. For + // callbacks this is not guaranteed, thus a thread dependent value could + // be different for the caller and callee, making it invalid to propagate. + if (ACS.isCallbackCall()) + if (auto *C = dyn_cast<Constant>(ArgOp)) + if (C->isThreadDependent()) + return false; + return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue); + }; + + if (!A.checkForAllCallSites(PredForCallSite, *this, true)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyReturned : AAValueSimplifyImpl { + AAValueSimplifyReturned(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto PredForReturned = [&](Value &V) { + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!A.checkForAllReturnedValues(PredForReturned, *this)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFloating : AAValueSimplifyImpl { + AAValueSimplifyFloating(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). 
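All AAValueSimplify variants funnel their candidates through checkAndUpdate(), which implements a small three-valued lattice over Optional<Value *>: still undecided, a single agreed-upon candidate, or a conflict that ends simplification. Below is a minimal sketch of that unification step, with std::optional<std::string> standing in for Optional<Value *> and an empty string standing in for undef; both stand-ins are illustrative only and do not match the real types.

#include <cassert>
#include <optional>
#include <string>

// Three-valued candidate lattice used when unifying simplified values:
//  - std::nullopt : nothing decided yet (still optimistic),
//  - a value      : every source agreed on this single candidate,
//  - conflict     : two sources disagreed, simplification is impossible.
using Value = std::string; // stand-in for llvm::Value*

bool checkAndUpdate(const std::optional<Value> &Incoming,
                    std::optional<Value> &Accumulated) {
  if (!Incoming)            // incoming source is still undecided: keep going
    return true;
  if (Incoming->empty())    // models "undef": compatible with anything
    return true;
  if (Accumulated)          // already have a candidate: must match exactly
    return Accumulated == Incoming;
  Accumulated = Incoming;   // first concrete candidate becomes the current one
  return true;
}

int main() {
  std::optional<Value> Acc;
  assert(checkAndUpdate(Value("42"), Acc) && Acc && *Acc == "42");
  assert(checkAndUpdate(std::nullopt, Acc)); // undecided source is fine
  assert(!checkAndUpdate(Value("7"), Acc));  // conflicting candidate: give up
  return 0;
}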
+ void initialize(Attributor &A) override { + Value &V = getAnchorValue(); + + // TODO: add other stuffs + if (isa<Constant>(V)) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + bool HasValueBefore = SimplifiedAssociatedValue.hasValue(); + + auto VisitValueCB = [&](Value &V, BooleanState, bool Stripped) -> bool { + auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + // TODO: Look the instruction and check recursively. + + LLVM_DEBUG( + dbgs() << "[Attributor][ValueSimplify] Can't be stripped more : " + << V << "\n"); + return false; + } + return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue); + }; + + if (!genericValueTraversal<AAValueSimplify, BooleanState>( + A, getIRPosition(), *this, static_cast<BooleanState &>(*this), + VisitValueCB)) + if (!askSimplifiedValueForAAValueConstantRange(A)) + return indicatePessimisticFixpoint(); + + // If a candicate was found in this update, return CHANGED. + + return HasValueBefore == SimplifiedAssociatedValue.hasValue() + ? ChangeStatus::UNCHANGED + : ChangeStatus ::CHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyFunction : AAValueSimplifyImpl { + AAValueSimplifyFunction(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + SimplifiedAssociatedValue = &getAnchorValue(); + indicateOptimisticFixpoint(); + } + /// See AbstractAttribute::initialize(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable( + "AAValueSimplify(Function|CallSite)::updateImpl will not be called"); + } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FN_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSite : AAValueSimplifyFunction { + AAValueSimplifyCallSite(const IRPosition &IRP) + : AAValueSimplifyFunction(IRP) {} + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CS_ATTR(value_simplify) + } +}; + +struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned { + AAValueSimplifyCallSiteReturned(const IRPosition &IRP) + : AAValueSimplifyReturned(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_simplify) + } +}; +struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating { + AAValueSimplifyCallSiteArgument(const IRPosition &IRP) + : AAValueSimplifyFloating(IRP) {} + + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_simplify) + } +}; + +/// ----------------------- Heap-To-Stack Conversion --------------------------- +struct AAHeapToStackImpl : public AAHeapToStack { + AAHeapToStackImpl(const IRPosition &IRP) : AAHeapToStack(IRP) {} + + const std::string getAsStr() const override { + return "[H2S] Mallocs: " + std::to_string(MallocCalls.size()); + } + + ChangeStatus manifest(Attributor &A) override { + assert(getState().isValidState() && + "Attempted to manifest an invalid state!"); + + ChangeStatus HasChanged = ChangeStatus::UNCHANGED; + Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + for (Instruction *MallocCall : MallocCalls) { + // This malloc cannot be replaced. 
+ if (BadMallocCalls.count(MallocCall)) + continue; + + for (Instruction *FreeCall : FreesForMalloc[MallocCall]) { + LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n"); + A.deleteAfterManifest(*FreeCall); + HasChanged = ChangeStatus::CHANGED; + } + + LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall + << "\n"); + + Constant *Size; + if (isCallocLikeFn(MallocCall, TLI)) { + auto *Num = cast<ConstantInt>(MallocCall->getOperand(0)); + auto *SizeT = dyn_cast<ConstantInt>(MallocCall->getOperand(1)); + APInt TotalSize = SizeT->getValue() * Num->getValue(); + Size = + ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize); + } else { + Size = cast<ConstantInt>(MallocCall->getOperand(0)); + } + + unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace(); + Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, + Size, "", MallocCall->getNextNode()); + + if (AI->getType() != MallocCall->getType()) + AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc", + AI->getNextNode()); + + replaceAllInstructionUsesWith(*MallocCall, *AI); + + if (auto *II = dyn_cast<InvokeInst>(MallocCall)) { + auto *NBB = II->getNormalDest(); + BranchInst::Create(NBB, MallocCall->getParent()); + A.deleteAfterManifest(*MallocCall); + } else { + A.deleteAfterManifest(*MallocCall); + } + + if (isCallocLikeFn(MallocCall, TLI)) { + auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc", + AI->getNextNode()); + Value *Ops[] = { + BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size, + ConstantInt::get(Type::getInt1Ty(F->getContext()), false)}; + + Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()}; + Module *M = F->getParent(); + Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + CallInst::Create(Fn, Ops, "", BI->getNextNode()); + } + HasChanged = ChangeStatus::CHANGED; } - if (CS.paramHasAttr(ArgNo, Attribute::NonNull)) + return HasChanged; + } + + /// Collection of all malloc calls in a function. + SmallSetVector<Instruction *, 4> MallocCalls; + + /// Collection of malloc calls that cannot be converted. + DenseSet<const Instruction *> BadMallocCalls; + + /// A map for each malloc call to the set of associated free calls. + DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc; + + ChangeStatus updateImpl(Attributor &A) override; +}; + +ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { + const Function *F = getAssociatedFunction(); + const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); + + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + + auto FreeCheck = [&](Instruction &I) { + const auto &Frees = FreesForMalloc.lookup(&I); + if (Frees.size() != 1) + return false; + Instruction *UniqueFree = *Frees.begin(); + return Explorer.findInContextOf(UniqueFree, I.getNextNode()); + }; + + auto UsesCheck = [&](Instruction &I) { + bool ValidUsesOnly = true; + bool MustUse = true; + auto Pred = [&](const Use &U, bool &Follow) -> bool { + Instruction *UserI = cast<Instruction>(U.getUser()); + if (isa<LoadInst>(UserI)) + return true; + if (auto *SI = dyn_cast<StoreInst>(UserI)) { + if (SI->getValueOperand() == U.get()) { + LLVM_DEBUG(dbgs() + << "[H2S] escaping store to memory: " << *UserI << "\n"); + ValidUsesOnly = false; + } else { + // A store into the malloc'ed memory is fine. 
+ } + return true; + } + if (auto *CB = dyn_cast<CallBase>(UserI)) { + if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd()) + return true; + // Record malloc. + if (isFreeCall(UserI, TLI)) { + if (MustUse) { + FreesForMalloc[&I].insert(UserI); + } else { + LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: " + << *UserI << "\n"); + ValidUsesOnly = false; + } + return true; + } + + unsigned ArgNo = CB->getArgOperandNo(&U); + + const auto &NoCaptureAA = A.getAAFor<AANoCapture>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + + // If a callsite argument use is nofree, we are fine. + const auto &ArgNoFreeAA = A.getAAFor<AANoFree>( + *this, IRPosition::callsite_argument(*CB, ArgNo)); + + if (!NoCaptureAA.isAssumedNoCapture() || + !ArgNoFreeAA.isAssumedNoFree()) { + LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n"); + ValidUsesOnly = false; + } + return true; + } + + if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) || + isa<PHINode>(UserI) || isa<SelectInst>(UserI)) { + MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI)); + Follow = true; + return true; + } + // Unknown user for which we can not track uses further (in a way that + // makes sense). + LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n"); + ValidUsesOnly = false; return true; + }; + A.checkForAllUses(Pred, *this, I); + return ValidUsesOnly; + }; - Value *V = CS.getArgOperand(ArgNo); - if (isKnownNonZero(V, getAnchorScope().getParent()->getDataLayout())) + auto MallocCallocCheck = [&](Instruction &I) { + if (BadMallocCalls.count(&I)) return true; - return false; + bool IsMalloc = isMallocLikeFn(&I, TLI); + bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI); + if (!IsMalloc && !IsCalloc) { + BadMallocCalls.insert(&I); + return true; + } + + if (IsMalloc) { + if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0))) + if (Size->getValue().ule(MaxHeapToStackSize)) + if (UsesCheck(I) || FreeCheck(I)) { + MallocCalls.insert(&I); + return true; + } + } else if (IsCalloc) { + bool Overflow = false; + if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0))) + if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1))) + if ((Size->getValue().umul_ov(Num->getValue(), Overflow)) + .ule(MaxHeapToStackSize)) + if (!Overflow && (UsesCheck(I) || FreeCheck(I))) { + MallocCalls.insert(&I); + return true; + } + } + + BadMallocCalls.insert(&I); + return true; }; - if (!A.checkForAllCallSites(F, CallSiteCheck, true)) { - indicatePessimisticFixpoint(); + + size_t NumBadMallocs = BadMallocCalls.size(); + + A.checkForAllCallLikeInstructions(MallocCallocCheck, *this); + + if (NumBadMallocs != BadMallocCalls.size()) return ChangeStatus::CHANGED; - } + return ChangeStatus::UNCHANGED; } -ChangeStatus AANonNullCallSiteArgument::updateImpl(Attributor &A) { - // NOTE: Never look at the argument of the callee in this method. - // If we do this, "nonnull" is always deduced because of the assumption. 
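The calloc path above only fires when num * size stays within the MaxHeapToStackSize threshold and the multiplication does not overflow (APInt::umul_ov). A standalone sketch of that guard follows; it uses the GCC/Clang __builtin_mul_overflow intrinsic and a made-up threshold value purely for illustration, not the real command-line option.

#include <cstdint>
#include <cstdio>

// Overflow-checked "num * size <= limit" test, mirroring the calloc case
// above. MaxHeapToStackSize is a stand-in for the real threshold.
constexpr uint64_t MaxHeapToStackSize = 128;

bool fitsOnStack(uint64_t Num, uint64_t Size) {
  uint64_t Total;
  // __builtin_mul_overflow is a GCC/Clang builtin; the LLVM code uses
  // APInt::umul_ov for the same purpose.
  if (__builtin_mul_overflow(Num, Size, &Total))
    return false;                       // overflow: never convert
  return Total <= MaxHeapToStackSize;   // only small allocations qualify
}

int main() {
  std::printf("%d\n", fitsOnStack(16, 8));               // 1: 128 bytes fits
  std::printf("%d\n", fitsOnStack(1u << 20, 1u << 20));  // 0: too large
  std::printf("%d\n", fitsOnStack(UINT64_MAX, 2));       // 0: overflow
}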
+struct AAHeapToStackFunction final : public AAHeapToStackImpl { + AAHeapToStackFunction(const IRPosition &IRP) : AAHeapToStackImpl(IRP) {} - Value &V = *getAssociatedValue(); + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECL(MallocCalls, Function, + "Number of malloc calls converted to allocas"); + for (auto *C : MallocCalls) + if (!BadMallocCalls.count(C)) + ++BUILD_STAT_NAME(MallocCalls, Function); + } +}; - auto *NonNullAA = A.getAAFor<AANonNull>(*this, V); +/// -------------------- Memory Behavior Attributes ---------------------------- +/// Includes read-none, read-only, and write-only. +/// ---------------------------------------------------------------------------- +struct AAMemoryBehaviorImpl : public AAMemoryBehavior { + AAMemoryBehaviorImpl(const IRPosition &IRP) : AAMemoryBehavior(IRP) {} - if (!NonNullAA || !NonNullAA->isAssumedNonNull()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + intersectAssumedBits(BEST_STATE); + getKnownStateFromValue(getIRPosition(), getState()); + IRAttribute::initialize(A); } - return ChangeStatus::UNCHANGED; -} + /// Return the memory behavior information encoded in the IR for \p IRP. + static void getKnownStateFromValue(const IRPosition &IRP, + BitIntegerState &State, + bool IgnoreSubsumingPositions = false) { + SmallVector<Attribute, 2> Attrs; + IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions); + for (const Attribute &Attr : Attrs) { + switch (Attr.getKindAsEnum()) { + case Attribute::ReadNone: + State.addKnownBits(NO_ACCESSES); + break; + case Attribute::ReadOnly: + State.addKnownBits(NO_WRITES); + break; + case Attribute::WriteOnly: + State.addKnownBits(NO_READS); + break; + default: + llvm_unreachable("Unexpcted attribute!"); + } + } -/// ------------------------ Will-Return Attributes ---------------------------- + if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) { + if (!I->mayReadFromMemory()) + State.addKnownBits(NO_READS); + if (!I->mayWriteToMemory()) + State.addKnownBits(NO_WRITES); + } + } -struct AAWillReturnImpl : public AAWillReturn, BooleanState { + /// See AbstractAttribute::getDeducedAttributes(...). + void getDeducedAttributes(LLVMContext &Ctx, + SmallVectorImpl<Attribute> &Attrs) const override { + assert(Attrs.size() == 0); + if (isAssumedReadNone()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone)); + else if (isAssumedReadOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly)); + else if (isAssumedWriteOnly()) + Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly)); + assert(Attrs.size() <= 1); + } - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnImpl(Function &F, InformationCache &InfoCache) - : AAWillReturn(F, InfoCache) {} + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + const IRPosition &IRP = getIRPosition(); + + // Check if we would improve the existing attributes first. + SmallVector<Attribute, 4> DeducedAttrs; + getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs); + if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) { + return IRP.hasAttr(Attr.getKindAsEnum(), + /* IgnoreSubsumingPositions */ true); + })) + return ChangeStatus::UNCHANGED; + + // Clear existing attributes. + IRP.removeAttrs(AttrKinds); + + // Use the generic manifest method. + return IRAttribute::manifest(A); + } - /// See AAWillReturn::isKnownWillReturn(). 
- bool isKnownWillReturn() const override { return getKnown(); } + /// See AbstractState::getAsStr(). + const std::string getAsStr() const override { + if (isAssumedReadNone()) + return "readnone"; + if (isAssumedReadOnly()) + return "readonly"; + if (isAssumedWriteOnly()) + return "writeonly"; + return "may-read/write"; + } - /// See AAWillReturn::isAssumedWillReturn(). - bool isAssumedWillReturn() const override { return getAssumed(); } + /// The set of IR attributes AAMemoryBehavior deals with. + static const Attribute::AttrKind AttrKinds[3]; +}; - /// See AbstractAttribute::getState(...). - AbstractState &getState() override { return *this; } +const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = { + Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly}; - /// See AbstractAttribute::getState(...). - const AbstractState &getState() const override { return *this; } +/// Memory behavior attribute for a floating value. +struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl { + AAMemoryBehaviorFloating(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} - /// See AbstractAttribute::getAsStr() - const std::string getAsStr() const override { - return getAssumed() ? "willreturn" : "may-noreturn"; + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + // Initialize the use vector with all direct uses of the associated value. + for (const Use &U : getAssociatedValue().uses()) + Uses.insert(&U); } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FLOATING_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FLOATING_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FLOATING_ATTR(writeonly) + } + +private: + /// Return true if users of \p UserI might access the underlying + /// variable/location described by \p U and should therefore be analyzed. + bool followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI); + + /// Update the state according to the effect of use \p U in \p UserI. + void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI); + +protected: + /// Container for (transitive) uses of the associated argument. + SetVector<const Use *> Uses; }; -struct AAWillReturnFunction final : AAWillReturnImpl { +/// Memory behavior attribute for function argument. +struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating { + AAMemoryBehaviorArgument(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} - /// See AbstractAttribute::AbstractAttribute(...). - AAWillReturnFunction(Function &F, InformationCache &InfoCache) - : AAWillReturnImpl(F, InfoCache) {} + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + intersectAssumedBits(BEST_STATE); + const IRPosition &IRP = getIRPosition(); + // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we + // can query it when we use has/getAttr. That would allow us to reuse the + // initialize of the base class here. + bool HasByVal = + IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true); + getKnownStateFromValue(IRP, getState(), + /* IgnoreSubsumingPositions */ HasByVal); + + // Initialize the use vector with all direct uses of the associated value. 
+ Argument *Arg = getAssociatedArgument(); + if (!Arg || !Arg->getParent()->hasExactDefinition()) { + indicatePessimisticFixpoint(); + } else { + // Initialize the use vector with all direct uses of the associated value. + for (const Use &U : Arg->uses()) + Uses.insert(&U); + } + } - /// See AbstractAttribute::getManifestPosition(). - ManifestPosition getManifestPosition() const override { - return MP_FUNCTION; + ChangeStatus manifest(Attributor &A) override { + // TODO: From readattrs.ll: "inalloca parameters are always + // considered written" + if (hasAttr({Attribute::InAlloca})) { + removeKnownBits(NO_WRITES); + removeAssumedBits(NO_WRITES); + } + return AAMemoryBehaviorFloating::manifest(A); } + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_ARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_ARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_ARG_ATTR(writeonly) + } +}; + +struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument { + AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP) + : AAMemoryBehaviorArgument(IRP) {} + /// See AbstractAttribute::initialize(...). - void initialize(Attributor &A) override; + void initialize(Attributor &A) override { + if (Argument *Arg = getAssociatedArgument()) { + if (Arg->hasByValAttr()) { + addKnownBits(NO_WRITES); + removeKnownBits(NO_READS); + removeAssumedBits(NO_READS); + } + } else { + } + AAMemoryBehaviorArgument::initialize(A); + } /// See AbstractAttribute::updateImpl(...). - ChangeStatus updateImpl(Attributor &A) override; + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Argument *Arg = getAssociatedArgument(); + const IRPosition &ArgPos = IRPosition::argument(*Arg); + auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CSARG_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CSARG_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CSARG_ATTR(writeonly) + } }; -// Helper function that checks whether a function has any cycle. -// TODO: Replace with more efficent code -bool containsCycle(Function &F) { - SmallPtrSet<BasicBlock *, 32> Visited; +/// Memory behavior attribute for a call site return position. +struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating { + AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP) + : AAMemoryBehaviorFloating(IRP) {} - // Traverse BB by dfs and check whether successor is already visited. - for (BasicBlock *BB : depth_first(&F)) { - Visited.insert(BB); - for (auto *SuccBB : successors(BB)) { - if (Visited.count(SuccBB)) - return true; + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + // We do not annotate returned values. + return ChangeStatus::UNCHANGED; + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override {} +}; + +/// An AA to represent the memory behavior function attributes. 
+struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl { + AAMemoryBehaviorFunction(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(Attributor &A). + virtual ChangeStatus updateImpl(Attributor &A) override; + + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + Function &F = cast<Function>(getAnchorValue()); + if (isAssumedReadNone()) { + F.removeFnAttr(Attribute::ArgMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOnly); + F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly); } + return AAMemoryBehaviorImpl::manifest(A); } - return false; + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_FN_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_FN_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_FN_ATTR(writeonly) + } +}; + +/// AAMemoryBehavior attribute for call sites. +struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl { + AAMemoryBehaviorCallSite(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAMemoryBehaviorImpl::initialize(A); + Function *F = getAssociatedFunction(); + if (!F || !F->hasExactDefinition()) + indicatePessimisticFixpoint(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Once we have call site specific value information we can provide + // call site specific liveness liveness information and then it makes + // sense to specialize attributes for call sites arguments instead of + // redirecting requests to the callee argument. + Function *F = getAssociatedFunction(); + const IRPosition &FnPos = IRPosition::function(*F); + auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + return clampStateAndIndicateChange( + getState(), + static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState())); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + if (isAssumedReadNone()) + STATS_DECLTRACK_CS_ATTR(readnone) + else if (isAssumedReadOnly()) + STATS_DECLTRACK_CS_ATTR(readonly) + else if (isAssumedWriteOnly()) + STATS_DECLTRACK_CS_ATTR(writeonly) + } +}; +} // namespace + +ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) { + + // The current assumed state used to determine a change. + auto AssumedState = getAssumed(); + + auto CheckRWInst = [&](Instruction &I) { + // If the instruction has an own memory behavior state, use it to restrict + // the local state. No further analysis is required as the other memory + // state is as optimistic as it gets. + if (ImmutableCallSite ICS = ImmutableCallSite(&I)) { + const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>( + *this, IRPosition::callsite_function(ICS)); + intersectAssumedBits(MemBehaviorAA.getAssumed()); + return !isAtFixpoint(); + } + + // Remove access kind modifiers if necessary. + if (I.mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (I.mayWriteToMemory()) + removeAssumedBits(NO_WRITES); + return !isAtFixpoint(); + }; + + if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this)) + return indicatePessimisticFixpoint(); + + return (AssumedState != getAssumed()) ? 
ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; } -// Helper function that checks the function have a loop which might become an -// endless loop -// FIXME: Any cycle is regarded as endless loop for now. -// We have to allow some patterns. -bool containsPossiblyEndlessLoop(Function &F) { return containsCycle(F); } +ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) { + + const IRPosition &IRP = getIRPosition(); + const IRPosition &FnPos = IRPosition::function_scope(IRP); + AAMemoryBehavior::StateType &S = getState(); + + // First, check the function scope. We take the known information and we avoid + // work if the assumed information implies the current assumed information for + // this attribute. This is a valid for all but byval arguments. + Argument *Arg = IRP.getAssociatedArgument(); + AAMemoryBehavior::base_t FnMemAssumedState = + AAMemoryBehavior::StateType::getWorstState(); + if (!Arg || !Arg->hasByValAttr()) { + const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos); + FnMemAssumedState = FnMemAA.getAssumed(); + S.addKnownBits(FnMemAA.getKnown()); + if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed()) + return ChangeStatus::UNCHANGED; + } -void AAWillReturnFunction::initialize(Attributor &A) { - Function &F = getAnchorScope(); + // Make sure the value is not captured (except through "return"), if + // it is, any information derived would be irrelevant anyway as we cannot + // check the potential aliases introduced by the capture. However, no need + // to fall back to anythign less optimistic than the function state. + const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>( + *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL); + if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) { + S.intersectAssumedBits(FnMemAssumedState); + return ChangeStatus::CHANGED; + } - if (containsPossiblyEndlessLoop(F)) - indicatePessimisticFixpoint(); + // The current assumed state used to determine a change. + auto AssumedState = S.getAssumed(); + + // Liveness information to exclude dead users. + // TODO: Take the FnPos once we have call site specific liveness information. + const auto &LivenessAA = A.getAAFor<AAIsDead>( + *this, IRPosition::function(*IRP.getAssociatedFunction())); + + // Visit and expand uses until all are analyzed or a fixpoint is reached. + for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) { + const Use *U = Uses[i]; + Instruction *UserI = cast<Instruction>(U->getUser()); + LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI + << " [Dead: " << (LivenessAA.isAssumedDead(UserI)) + << "]\n"); + if (LivenessAA.isAssumedDead(UserI)) + continue; + + // Check if the users of UserI should also be visited. + if (followUsersOfUseIn(A, U, UserI)) + for (const Use &UserIUse : UserI->uses()) + Uses.insert(&UserIUse); + + // If UserI might touch memory we analyze the use in detail. + if (UserI->mayReadOrWriteMemory()) + analyzeUseIn(A, U, UserI); + } + + return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED + : ChangeStatus::UNCHANGED; } -ChangeStatus AAWillReturnFunction::updateImpl(Attributor &A) { - Function &F = getAnchorScope(); +bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + // The loaded value is unrelated to the pointer argument, no need to + // follow the users of the load. + if (isa<LoadInst>(UserI)) + return false; - // The map from instruction opcodes to those instructions in the function. 
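The function-level update above starts from the most optimistic assumption (readnone) and drops NO_READS/NO_WRITES for every reaching instruction that may read or write. A self-contained sketch of that deduction loop follows; the Instr struct is only a stand-in for querying mayReadFromMemory()/mayWriteToMemory() on a real instruction, and the string results mirror getAsStr() above.

#include <cstdint>
#include <cstdio>

// Memory-behavior lattice: two independent "no access" bits that can only
// be removed, never added back, as instructions are visited.
enum : uint32_t { NO_READS = 1u << 0, NO_WRITES = 1u << 1,
                  NO_ACCESSES = NO_READS | NO_WRITES };

struct Instr { bool MayRead, MayWrite; }; // stand-in for llvm::Instruction

const char *deduceMemoryBehavior(const Instr *Body, unsigned N) {
  uint32_t Assumed = NO_ACCESSES;          // optimistic start: readnone
  for (unsigned i = 0; i < N && Assumed; ++i) {
    if (Body[i].MayRead)  Assumed &= ~NO_READS;
    if (Body[i].MayWrite) Assumed &= ~NO_WRITES;
  }
  if (Assumed == NO_ACCESSES) return "readnone";
  if (Assumed & NO_WRITES)    return "readonly";
  if (Assumed & NO_READS)     return "writeonly";
  return "may-read/write";
}

int main() {
  Instr Body[] = {{true, false}, {true, false}};       // two loads
  std::printf("%s\n", deduceMemoryBehavior(Body, 2));  // readonly
}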
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + // By default we follow all uses assuming UserI might leak information on U, + // we have special handling for call sites operands though. + ImmutableCallSite ICS(UserI); + if (!ICS || !ICS.isArgOperand(U)) + return true; - for (unsigned Opcode : - {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, - (unsigned)Instruction::Call}) { - for (Instruction *I : OpcodeInstMap[Opcode]) { - auto ICS = ImmutableCallSite(I); + // If the use is a call argument known not to be captured, the users of + // the call do not need to be visited because they have to be unrelated to + // the input. Note that this check is not trivial even though we disallow + // general capturing of the underlying argument. The reason is that the + // call might the argument "through return", which we allow and for which we + // need to check call users. + unsigned ArgNo = ICS.getArgumentNo(U); + const auto &ArgNoCaptureAA = + A.getAAFor<AANoCapture>(*this, IRPosition::callsite_argument(ICS, ArgNo)); + return !ArgNoCaptureAA.isAssumedNoCapture(); +} - if (ICS.hasFnAttr(Attribute::WillReturn)) - continue; +void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U, + const Instruction *UserI) { + assert(UserI->mayReadOrWriteMemory()); - auto *WillReturnAA = A.getAAFor<AAWillReturn>(*this, *I); - if (!WillReturnAA || !WillReturnAA->isAssumedWillReturn()) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + switch (UserI->getOpcode()) { + default: + // TODO: Handle all atomics and other side-effect operations we know of. + break; + case Instruction::Load: + // Loads cause the NO_READS property to disappear. + removeAssumedBits(NO_READS); + return; + + case Instruction::Store: + // Stores cause the NO_WRITES property to disappear if the use is the + // pointer operand. Note that we do assume that capturing was taken care of + // somewhere else. + if (cast<StoreInst>(UserI)->getPointerOperand() == U->get()) + removeAssumedBits(NO_WRITES); + return; - auto *NoRecurseAA = A.getAAFor<AANoRecurse>(*this, *I); + case Instruction::Call: + case Instruction::CallBr: + case Instruction::Invoke: { + // For call sites we look at the argument memory behavior attribute (this + // could be recursive!) in order to restrict our own state. + ImmutableCallSite ICS(UserI); - // FIXME: (i) Prohibit any recursion for now. - // (ii) AANoRecurse isn't implemented yet so currently any call is - // regarded as having recursion. - // Code below should be - // if ((!NoRecurseAA || !NoRecurseAA->isAssumedNoRecurse()) && - if (!NoRecurseAA && !ICS.hasFnAttr(Attribute::NoRecurse)) { - indicatePessimisticFixpoint(); - return ChangeStatus::CHANGED; - } + // Give up on operand bundles. + if (ICS.isBundleOperand(U)) { + indicatePessimisticFixpoint(); + return; } + + // Calling a function does read the function pointer, maybe write it if the + // function is self-modifying. + if (ICS.isCallee(U)) { + removeAssumedBits(NO_READS); + break; + } + + // Adjust the possible access behavior based on the information on the + // argument. + unsigned ArgNo = ICS.getArgumentNo(U); + const IRPosition &ArgPos = IRPosition::callsite_argument(ICS, ArgNo); + const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos); + // "assumed" has at most the same bits as the MemBehaviorAA assumed + // and at least "known". 
+ intersectAssumedBits(MemBehaviorAA.getAssumed()); + return; } + }; - return ChangeStatus::UNCHANGED; + // Generally, look at the "may-properties" and adjust the assumed state if we + // did not trigger special handling before. + if (UserI->mayReadFromMemory()) + removeAssumedBits(NO_READS); + if (UserI->mayWriteToMemory()) + removeAssumedBits(NO_WRITES); } +/// ------------------ Value Constant Range Attribute ------------------------- + +struct AAValueConstantRangeImpl : AAValueConstantRange { + using StateType = IntegerRangeState; + AAValueConstantRangeImpl(const IRPosition &IRP) : AAValueConstantRange(IRP) {} + + /// See AbstractAttribute::getAsStr(). + const std::string getAsStr() const override { + std::string Str; + llvm::raw_string_ostream OS(Str); + OS << "range(" << getBitWidth() << ")<"; + getKnown().print(OS); + OS << " / "; + getAssumed().print(OS); + OS << ">"; + return OS.str(); + } + + /// Helper function to get a SCEV expr for the associated value at program + /// point \p I. + const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return nullptr; + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>( + *getAnchorScope()); + + if (!SE || !LI) + return nullptr; + + const SCEV *S = SE->getSCEV(&getAssociatedValue()); + if (!I) + return S; + + return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent())); + } + + /// Helper function to get a range from SCEV for the associated value at + /// program point \p I. + ConstantRange getConstantRangeFromSCEV(Attributor &A, + const Instruction *I = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + ScalarEvolution *SE = + A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>( + *getAnchorScope()); + + const SCEV *S = getSCEV(A, I); + if (!SE || !S) + return getWorstState(getBitWidth()); + + return SE->getUnsignedRange(S); + } + + /// Helper function to get a range from LVI for the associated value at + /// program point \p I. + ConstantRange + getConstantRangeFromLVI(Attributor &A, + const Instruction *CtxI = nullptr) const { + if (!getAnchorScope()) + return getWorstState(getBitWidth()); + + LazyValueInfo *LVI = + A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>( + *getAnchorScope()); + + if (!LVI || !CtxI) + return getWorstState(getBitWidth()); + return LVI->getConstantRange(&getAssociatedValue(), + const_cast<BasicBlock *>(CtxI->getParent()), + const_cast<Instruction *>(CtxI)); + } + + /// See AAValueConstantRange::getKnownConstantRange(..). + ConstantRange + getKnownConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + if (!CtxI || CtxI == getCtxI()) + return getKnown(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getKnown().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AAValueConstantRange::getAssumedConstantRange(..). + ConstantRange + getAssumedConstantRange(Attributor &A, + const Instruction *CtxI = nullptr) const override { + // TODO: Make SCEV use Attributor assumption. + // We may be able to bound a variable range via assumptions in + // Attributor. ex.) If x is assumed to be in [1, 3] and y is known to + // evolve to x^2 + x, then we can say that y is in [2, 12]. 
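Both the known and the assumed range are refined by intersecting whatever the different sources (the IR state, SCEV, LVI) can individually prove, as in getKnownConstantRange() above. A minimal sketch with a plain, non-wrapping interval type follows; llvm::ConstantRange additionally handles wrapping ranges, arbitrary bit widths, and full/empty sets, so this is only an illustration of the intersection step.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Non-wrapping stand-in for llvm::ConstantRange, just to show how ranges
// from different sources are combined by intersection.
struct Range {
  int64_t Lo, Hi; // inclusive bounds; Lo > Hi encodes the empty set

  Range intersectWith(const Range &O) const {
    return {std::max(Lo, O.Lo), std::min(Hi, O.Hi)};
  }
  bool isEmpty() const { return Lo > Hi; }
};

int main() {
  Range FromState{0, 100};  // what the attribute currently knows
  Range FromSCEV{10, 300};  // bound derived from SCEV at the program point
  Range FromLVI{-5, 40};    // bound derived from LazyValueInfo

  Range Known = FromState.intersectWith(FromSCEV).intersectWith(FromLVI);
  std::printf("known range: [%lld, %lld] empty=%d\n",
              (long long)Known.Lo, (long long)Known.Hi, Known.isEmpty());
  // known range: [10, 40] empty=0
}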
+ + if (!CtxI || CtxI == getCtxI()) + return getAssumed(); + + ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI); + ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI); + return getAssumed().intersectWith(SCEVR).intersectWith(LVIR); + } + + /// See AbstractAttribute::initialize(..). + void initialize(Attributor &A) override { + // Intersect a range given by SCEV. + intersectKnown(getConstantRangeFromSCEV(A, getCtxI())); + + // Intersect a range given by LVI. + intersectKnown(getConstantRangeFromLVI(A, getCtxI())); + } + + /// Helper function to create MDNode for range metadata. + static MDNode * + getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx, + const ConstantRange &AssumedConstantRange) { + Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getLower())), + ConstantAsMetadata::get(ConstantInt::get( + Ty, AssumedConstantRange.getUpper()))}; + return MDNode::get(Ctx, LowAndHigh); + } + + /// Return true if \p Assumed is included in \p KnownRanges. + static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) { + + if (Assumed.isFullSet()) + return false; + + if (!KnownRanges) + return true; + + // If multiple ranges are annotated in IR, we give up to annotate assumed + // range for now. + + // TODO: If there exists a known range which containts assumed range, we + // can say assumed range is better. + if (KnownRanges->getNumOperands() > 2) + return false; + + ConstantInt *Lower = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(0)); + ConstantInt *Upper = + mdconst::extract<ConstantInt>(KnownRanges->getOperand(1)); + + ConstantRange Known(Lower->getValue(), Upper->getValue()); + return Known.contains(Assumed) && Known != Assumed; + } + + /// Helper function to set range metadata. + static bool + setRangeMetadataIfisBetterRange(Instruction *I, + const ConstantRange &AssumedConstantRange) { + auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range); + if (isBetterRange(AssumedConstantRange, OldRangeMD)) { + if (!AssumedConstantRange.isEmptySet()) { + I->setMetadata(LLVMContext::MD_range, + getMDNodeForConstantRange(I->getType(), I->getContext(), + AssumedConstantRange)); + return true; + } + } + return false; + } + + /// See AbstractAttribute::manifest() + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + ConstantRange AssumedConstantRange = getAssumedConstantRange(A); + assert(!AssumedConstantRange.isFullSet() && "Invalid state"); + + auto &V = getAssociatedValue(); + if (!AssumedConstantRange.isEmptySet() && + !AssumedConstantRange.isSingleElement()) { + if (Instruction *I = dyn_cast<Instruction>(&V)) + if (isa<CallInst>(I) || isa<LoadInst>(I)) + if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange)) + Changed = ChangeStatus::CHANGED; + } + + return Changed; + } +}; + +struct AAValueConstantRangeArgument final : public AAValueConstantRangeImpl { + + AAValueConstantRangeArgument(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAArgumentFromCallSiteArguments + + IntegerRangeState S(getBitWidth()); + clampCallSiteArgumentStates<AAValueConstantRange, IntegerRangeState>( + A, *this, S); + + // TODO: If we know we visited all incoming values, thus no are assumed + // dead, we can take the known information from the state T. 
+ return clampStateAndIndicateChange<IntegerRangeState>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_ARG_ATTR(value_range) + } +}; + +struct AAValueConstantRangeReturned : AAValueConstantRangeImpl { + AAValueConstantRangeReturned(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + // TODO: Use AAReturnedFromReturnedValues + + // TODO: If we know we visited all returned values, thus no are assumed + // dead, we can take the known information from the state T. + + IntegerRangeState S(getBitWidth()); + + clampReturnedValueStates<AAValueConstantRange, IntegerRangeState>(A, *this, + S); + return clampStateAndIndicateChange<StateType>(this->getState(), S); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FNRET_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { + AAValueConstantRangeFloating(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AAValueConstantRange::initialize(A); + Value &V = getAssociatedValue(); + + if (auto *C = dyn_cast<ConstantInt>(&V)) { + unionAssumed(ConstantRange(C->getValue())); + indicateOptimisticFixpoint(); + return; + } + + if (isa<UndefValue>(&V)) { + indicateOptimisticFixpoint(); + return; + } + + if (auto *I = dyn_cast<Instruction>(&V)) + if (isa<BinaryOperator>(I) || isa<CmpInst>(I)) { + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + + if (LHS->getType()->isIntegerTy() && RHS->getType()->isIntegerTy()) + return; + } + + // If it is a load instruction with range metadata, use it. + if (LoadInst *LI = dyn_cast<LoadInst>(&V)) + if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) { + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + return; + } + + // Otherwise we give up. + indicatePessimisticFixpoint(); + + LLVM_DEBUG(dbgs() << "[Attributor][AAValueConstantRange] We give up: " + << getAssociatedValue()); + } + + bool calculateBinaryOperator(Attributor &A, BinaryOperator *BinOp, + IntegerRangeState &T, Instruction *CtxI) { + Value *LHS = BinOp->getOperand(0); + Value *RHS = BinOp->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + + auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange); + + T.unionAssumed(AssumedRange); + + // TODO: Track a known state too. + + return T.isValidState(); + } + + bool calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T, + Instruction *CtxI) { + Value *LHS = CmpI->getOperand(0); + Value *RHS = CmpI->getOperand(1); + + auto &LHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS)); + auto &RHSAA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS)); + + auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI); + auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI); + // If one of them is empty set, we can't decide. 
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet()) + return true; + + bool MustTrue = false, MustFalse = false; + + auto AllowedRegion = + ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange); + + auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion( + CmpI->getPredicate(), RHSAARange); + + if (AllowedRegion.intersectWith(LHSAARange).isEmptySet()) + MustFalse = true; + + if (SatisfyingRegion.contains(LHSAARange)) + MustTrue = true; + + assert((!MustTrue || !MustFalse) && + "Either MustTrue or MustFalse should be false!"); + + if (MustTrue) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1))); + else if (MustFalse) + T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0))); + else + T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true)); + + LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA + << " " << RHSAA << "\n"); + + // TODO: Track a known state too. + return T.isValidState(); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + Instruction *CtxI = getCtxI(); + auto VisitValueCB = [&](Value &V, IntegerRangeState &T, + bool Stripped) -> bool { + Instruction *I = dyn_cast<Instruction>(&V); + if (!I) { + + // If the value is not instruction, we query AA to Attributor. + const auto &AA = + A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V)); + + // Clamp operator is not used to utilize a program point CtxI. + T.unionAssumed(AA.getAssumedConstantRange(A, CtxI)); + + return T.isValidState(); + } + + if (auto *BinOp = dyn_cast<BinaryOperator>(I)) + return calculateBinaryOperator(A, BinOp, T, CtxI); + else if (auto *CmpI = dyn_cast<CmpInst>(I)) + return calculateCmpInst(A, CmpI, T, CtxI); + else { + // Give up with other instructions. + // TODO: Add other instructions + + T.indicatePessimisticFixpoint(); + return false; + } + }; + + IntegerRangeState T(getBitWidth()); + + if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>( + A, getIRPosition(), *this, T, VisitValueCB)) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_FLOATING_ATTR(value_range) + } +}; + +struct AAValueConstantRangeFunction : AAValueConstantRangeImpl { + AAValueConstantRangeFunction(const IRPosition &IRP) + : AAValueConstantRangeImpl(IRP) {} + + /// See AbstractAttribute::initialize(...). + ChangeStatus updateImpl(Attributor &A) override { + llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will " + "not be called"); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction { + AAValueConstantRangeCallSite(const IRPosition &IRP) + : AAValueConstantRangeFunction(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) } +}; + +struct AAValueConstantRangeCallSiteReturned : AAValueConstantRangeReturned { + AAValueConstantRangeCallSiteReturned(const IRPosition &IRP) + : AAValueConstantRangeReturned(IRP) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + // If it is a load instruction with range metadata, use the metadata. 
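// Illustrative sketch, not part of the patch: how calculateCmpInst() above
// can fold a compare once the operand ranges are known not to overlap.
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"

static bool cmpMustBeTrueExample() {
  using namespace llvm;
  ConstantRange LHS(APInt(32, 0), APInt(32, 10));  // [0, 10)
  ConstantRange RHS(APInt(32, 10), APInt(32, 20)); // [10, 20)
  ConstantRange Satisfying =
      ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, RHS);
  // Every value in [0, 10) is ult every value in [10, 20), so the icmp is
  // known to evaluate to true.
  return Satisfying.contains(LHS);
}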
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue())) + if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range)) + intersectKnown(getConstantRangeFromMetadata(*RangeMD)); + + AAValueConstantRangeReturned::initialize(A); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSRET_ATTR(value_range) + } +}; +struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating { + AAValueConstantRangeCallSiteArgument(const IRPosition &IRP) + : AAValueConstantRangeFloating(IRP) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { + STATS_DECLTRACK_CSARG_ATTR(value_range) + } +}; /// ---------------------------------------------------------------------------- /// Attributor /// ---------------------------------------------------------------------------- -bool Attributor::checkForAllCallSites(Function &F, - std::function<bool(CallSite)> &Pred, - bool RequireAllCallSites) { +bool Attributor::isAssumedDead(const AbstractAttribute &AA, + const AAIsDead *LivenessAA) { + const Instruction *CtxI = AA.getIRPosition().getCtxI(); + if (!CtxI) + return false; + + // TODO: Find a good way to utilize fine and coarse grained liveness + // information. + if (!LivenessAA) + LivenessAA = + &getAAFor<AAIsDead>(AA, IRPosition::function(*CtxI->getFunction()), + /* TrackDependence */ false); + + // Don't check liveness for AAIsDead. + if (&AA == LivenessAA) + return false; + + if (!LivenessAA->isAssumedDead(CtxI)) + return false; + + // We actually used liveness information so we have to record a dependence. + recordDependence(*LivenessAA, AA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllUses( + const function_ref<bool(const Use &, bool &)> &Pred, + const AbstractAttribute &QueryingAA, const Value &V) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + SmallVector<const Use *, 16> Worklist; + SmallPtrSet<const Use *, 16> Visited; + + for (const Use &U : V.uses()) + Worklist.push_back(&U); + + LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size() + << " initial uses to check\n"); + + if (Worklist.empty()) + return true; + + bool AnyDead = false; + const Function *ScopeFn = IRP.getAnchorScope(); + const auto *LivenessAA = + ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn), + /* TrackDependence */ false) + : nullptr; + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + if (!Visited.insert(U).second) + continue; + LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << "\n"); + if (Instruction *UserI = dyn_cast<Instruction>(U->getUser())) + if (LivenessAA && LivenessAA->isAssumedDead(UserI)) { + LLVM_DEBUG(dbgs() << "[Attributor] Dead user: " << *UserI << ": " + << *LivenessAA << "\n"); + AnyDead = true; + continue; + } + + bool Follow = false; + if (!Pred(*U, Follow)) + return false; + if (!Follow) + continue; + for (const Use &UU : U->getUser()->uses()) + Worklist.push_back(&UU); + } + + if (AnyDead) + recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllCallSites( + const function_ref<bool(AbstractCallSite)> &Pred, + const AbstractAttribute &QueryingAA, bool RequireAllCallSites) { // We can try to determine information from // the call sites. However, this is only possible all call sites are known, // hence the function has internal linkage. 
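// Hypothetical caller of the checkForAllUses() API defined above; the helper
// name and the property it queries are invented for illustration.
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/IPO/Attributor.h"

static bool neverPassedToCall(llvm::Attributor &A,
                              const llvm::AbstractAttribute &QueryingAA,
                              llvm::Value &V) {
  using namespace llvm;
  return A.checkForAllUses(
      [](const Use &U, bool &Follow) {
        Follow = false; // only inspect direct uses in this sketch
        return !isa<CallBase>(U.getUser());
      },
      QueryingAA, V);
}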
- if (RequireAllCallSites && !F.hasInternalLinkage()) { + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) { + LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP + << "\n"); + return false; + } + + return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites, + &QueryingAA); +} + +bool Attributor::checkForAllCallSites( + const function_ref<bool(AbstractCallSite)> &Pred, const Function &Fn, + bool RequireAllCallSites, const AbstractAttribute *QueryingAA) { + if (RequireAllCallSites && !Fn.hasLocalLinkage()) { LLVM_DEBUG( dbgs() - << "Attributor: Function " << F.getName() + << "[Attributor] Function " << Fn.getName() << " has no internal linkage, hence not all call sites are known\n"); return false; } - for (const Use &U : F.uses()) { + for (const Use &U : Fn.uses()) { + AbstractCallSite ACS(&U); + if (!ACS) { + LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName() + << " has non call site use " << *U.get() << " in " + << *U.getUser() << "\n"); + // BlockAddress users are allowed. + if (isa<BlockAddress>(U.getUser())) + continue; + return false; + } + + Instruction *I = ACS.getInstruction(); + Function *Caller = I->getFunction(); + + const auto *LivenessAA = + lookupAAFor<AAIsDead>(IRPosition::function(*Caller), QueryingAA, + /* TrackDependence */ false); + + // Skip dead calls. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + // We actually used liveness information so we have to record a + // dependence. + if (QueryingAA) + recordDependence(*LivenessAA, *QueryingAA, DepClassTy::OPTIONAL); + continue; + } - CallSite CS(U.getUser()); - if (!CS || !CS.isCallee(&U) || !CS.getCaller()->hasExactDefinition()) { + const Use *EffectiveUse = + ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U; + if (!ACS.isCallee(EffectiveUse)) { if (!RequireAllCallSites) continue; - - LLVM_DEBUG(dbgs() << "Attributor: User " << *U.getUser() - << " is an invalid use of " << F.getName() << "\n"); + LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser() + << " is an invalid use of " << Fn.getName() << "\n"); return false; } - if (Pred(CS)) + if (Pred(ACS)) continue; - LLVM_DEBUG(dbgs() << "Attributor: Call site callback failed for " - << *CS.getInstruction() << "\n"); + LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for " + << *ACS.getInstruction() << "\n"); return false; } return true; } -ChangeStatus Attributor::run() { - // Initialize all abstract attributes. - for (AbstractAttribute *AA : AllAbstractAttributes) - AA->initialize(*this); +bool Attributor::checkForAllReturnedValuesAndReturnInsts( + const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> + &Pred, + const AbstractAttribute &QueryingAA) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide return instructions we have to have an exact + // definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // If this is a call site query we use the call site specific return values + // and liveness information. + // TODO: use the function scope once we have call site AAReturnedValues. 
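// Hypothetical use of the call-site walker defined above: require that no
// known call site of Fn is a callback call.  Illustration only.
#include "llvm/Transforms/IPO/Attributor.h"

static bool onlyDirectCallSites(llvm::Attributor &A,
                                const llvm::Function &Fn) {
  return A.checkForAllCallSites(
      [](llvm::AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, Fn,
      /* RequireAllCallSites */ true, /* QueryingAA */ nullptr);
}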
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; + + return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred); +} + +bool Attributor::checkForAllReturnedValues( + const function_ref<bool(Value &)> &Pred, + const AbstractAttribute &QueryingAA) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP); + if (!AARetVal.getState().isValidState()) + return false; + + return AARetVal.checkForAllReturnedValuesAndReturnInsts( + [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) { + return Pred(RV); + }); +} + +static bool +checkForAllInstructionsImpl(InformationCache::OpcodeInstMapTy &OpcodeInstMap, + const function_ref<bool(Instruction &)> &Pred, + const AAIsDead *LivenessAA, bool &AnyDead, + const ArrayRef<unsigned> &Opcodes) { + for (unsigned Opcode : Opcodes) { + for (Instruction *I : OpcodeInstMap[Opcode]) { + // Skip dead instructions. + if (LivenessAA && LivenessAA->isAssumedDead(I)) { + AnyDead = true; + continue; + } + + if (!Pred(*I)) + return false; + } + } + return true; +} + +bool Attributor::checkForAllInstructions( + const llvm::function_ref<bool(Instruction &)> &Pred, + const AbstractAttribute &QueryingAA, const ArrayRef<unsigned> &Opcodes) { + + const IRPosition &IRP = QueryingAA.getIRPosition(); + // Since we need to provide instructions we have to have an exact definition. + const Function *AssociatedFunction = IRP.getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; + + auto &OpcodeInstMap = + InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction); + if (!checkForAllInstructionsImpl(OpcodeInstMap, Pred, &LivenessAA, AnyDead, + Opcodes)) + return false; + + // If we actually used liveness information so we have to record a dependence. + if (AnyDead) + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +bool Attributor::checkForAllReadWriteInstructions( + const llvm::function_ref<bool(Instruction &)> &Pred, + AbstractAttribute &QueryingAA) { + + const Function *AssociatedFunction = + QueryingAA.getIRPosition().getAssociatedFunction(); + if (!AssociatedFunction) + return false; + + // TODO: use the function scope once we have call site AAReturnedValues. + const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction); + const auto &LivenessAA = + getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false); + bool AnyDead = false; + + for (Instruction *I : + InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) { + // Skip dead instructions. + if (LivenessAA.isAssumedDead(I)) { + AnyDead = true; + continue; + } + + if (!Pred(*I)) + return false; + } + // If we actually used liveness information so we have to record a dependence. 
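// Hypothetical query built on the checkForAllInstructions() API above: every
// (assumed-live) return in the associated function must not return undef.
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/IPO/Attributor.h"

static bool noUndefReturns(llvm::Attributor &A,
                           const llvm::AbstractAttribute &QueryingAA) {
  using namespace llvm;
  return A.checkForAllInstructions(
      [](Instruction &I) {
        Value *RV = cast<ReturnInst>(I).getReturnValue();
        return !RV || !isa<UndefValue>(RV);
      },
      QueryingAA, {(unsigned)Instruction::Ret});
}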
+ if (AnyDead) + recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL); + + return true; +} + +ChangeStatus Attributor::run(Module &M) { LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized " << AllAbstractAttributes.size() << " abstract attributes.\n"); @@ -1367,41 +5855,107 @@ ChangeStatus Attributor::run() { unsigned IterationCounter = 1; SmallVector<AbstractAttribute *, 64> ChangedAAs; - SetVector<AbstractAttribute *> Worklist; + SetVector<AbstractAttribute *> Worklist, InvalidAAs; Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); + bool RecomputeDependences = false; + do { + // Remember the size to determine new attributes. + size_t NumAAs = AllAbstractAttributes.size(); LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter << ", Worklist size: " << Worklist.size() << "\n"); + // For invalid AAs we can fix dependent AAs that have a required dependence, + // thereby folding long dependence chains in a single step without the need + // to run updates. + for (unsigned u = 0; u < InvalidAAs.size(); ++u) { + AbstractAttribute *InvalidAA = InvalidAAs[u]; + auto &QuerriedAAs = QueryMap[InvalidAA]; + LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has " + << QuerriedAAs.RequiredAAs.size() << "/" + << QuerriedAAs.OptionalAAs.size() + << " required/optional dependences\n"); + for (AbstractAttribute *DepOnInvalidAA : QuerriedAAs.RequiredAAs) { + AbstractState &DOIAAState = DepOnInvalidAA->getState(); + DOIAAState.indicatePessimisticFixpoint(); + ++NumAttributesFixedDueToRequiredDependences; + assert(DOIAAState.isAtFixpoint() && "Expected fixpoint state!"); + if (!DOIAAState.isValidState()) + InvalidAAs.insert(DepOnInvalidAA); + } + if (!RecomputeDependences) + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + } + + // If dependences (=QueryMap) are recomputed we have to look at all abstract + // attributes again, regardless of what changed in the last iteration. + if (RecomputeDependences) { + LLVM_DEBUG( + dbgs() << "[Attributor] Run all AAs to recompute dependences\n"); + QueryMap.clear(); + ChangedAAs.clear(); + Worklist.insert(AllAbstractAttributes.begin(), + AllAbstractAttributes.end()); + } + // Add all abstract attributes that are potentially dependent on one that // changed to the work list. for (AbstractAttribute *ChangedAA : ChangedAAs) { auto &QuerriedAAs = QueryMap[ChangedAA]; - Worklist.insert(QuerriedAAs.begin(), QuerriedAAs.end()); + Worklist.insert(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + Worklist.insert(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } - // Reset the changed set. + LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter + << ", Worklist+Dependent size: " << Worklist.size() + << "\n"); + + // Reset the changed and invalid set. ChangedAAs.clear(); + InvalidAAs.clear(); // Update all abstract attribute in the work list and record the ones that // changed. for (AbstractAttribute *AA : Worklist) - if (AA->update(*this) == ChangeStatus::CHANGED) - ChangedAAs.push_back(AA); + if (!AA->getState().isAtFixpoint() && !isAssumedDead(*AA, nullptr)) { + QueriedNonFixAA = false; + if (AA->update(*this) == ChangeStatus::CHANGED) { + ChangedAAs.push_back(AA); + if (!AA->getState().isValidState()) + InvalidAAs.insert(AA); + } else if (!QueriedNonFixAA) { + // If the attribute did not query any non-fix information, the state + // will not change and we can indicate that right away. 
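// Stand-alone sketch of the dependence-driven worklist scheme used above:
// only states whose dependences changed are re-run.  The Node type and its
// monotone update are invented for illustration.
#include <algorithm>
#include <map>
#include <set>
#include <vector>

struct Node {
  int State = 0;              // grows monotonically up to a fixed cap
  std::vector<Node *> Inputs; // states this node queries during update()
  bool update() {
    int Old = State;
    for (Node *In : Inputs)
      State = std::max(State, std::min(In->State + 1, 8));
    return State != Old;      // "changed" means dependents must be re-run
  }
};

static void runToFixpoint(std::vector<Node *> &All,
                          std::map<Node *, std::set<Node *>> &Dependents) {
  std::set<Node *> Worklist(All.begin(), All.end());
  while (!Worklist.empty()) {
    std::set<Node *> Next;
    for (Node *N : Worklist)
      if (N->update())
        for (Node *Dep : Dependents[N])
          Next.insert(Dep);
    Worklist.swap(Next);
  }
}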
+ AA->getState().indicateOptimisticFixpoint(); + } + } + + // Check if we recompute the dependences in the next iteration. + RecomputeDependences = (DepRecomputeInterval > 0 && + IterationCounter % DepRecomputeInterval == 0); + + // Add attributes to the changed set if they have been created in the last + // iteration. + ChangedAAs.append(AllAbstractAttributes.begin() + NumAAs, + AllAbstractAttributes.end()); // Reset the work list and repopulate with the changed abstract attributes. // Note that dependent ones are added above. Worklist.clear(); Worklist.insert(ChangedAAs.begin(), ChangedAAs.end()); - } while (!Worklist.empty() && ++IterationCounter < MaxFixpointIterations); + } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations || + VerifyMaxFixpointIterations)); LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: " << IterationCounter << "/" << MaxFixpointIterations << " iterations\n"); - bool FinishedAtFixpoint = Worklist.empty(); + size_t NumFinalAAs = AllAbstractAttributes.size(); // Reset abstract arguments not settled in a sound fixpoint by now. This // happens when we stopped the fixpoint iteration early. Note that only the @@ -1422,7 +5976,10 @@ ChangeStatus Attributor::run() { } auto &QuerriedAAs = QueryMap[ChangedAA]; - ChangedAAs.append(QuerriedAAs.begin(), QuerriedAAs.end()); + ChangedAAs.append(QuerriedAAs.OptionalAAs.begin(), + QuerriedAAs.OptionalAAs.end()); + ChangedAAs.append(QuerriedAAs.RequiredAAs.begin(), + QuerriedAAs.RequiredAAs.end()); } LLVM_DEBUG({ @@ -1448,8 +6005,14 @@ ChangeStatus Attributor::run() { if (!State.isValidState()) continue; + // Skip dead code. + if (isAssumedDead(*AA, nullptr)) + continue; // Manifest the state and record if we changed the IR. ChangeStatus LocalChange = AA->manifest(*this); + if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled()) + AA->trackStatistics(); + ManifestChange = ManifestChange | LocalChange; NumAtFixpoint++; @@ -1462,69 +6025,407 @@ ChangeStatus Attributor::run() { << " arguments while " << NumAtFixpoint << " were in a valid fixpoint state\n"); - // If verification is requested, we finished this run at a fixpoint, and the - // IR was changed, we re-run the whole fixpoint analysis, starting at - // re-initialization of the arguments. This re-run should not result in an IR - // change. Though, the (virtual) state of attributes at the end of the re-run - // might be more optimistic than the known state or the IR state if the better - // state cannot be manifested. - if (VerifyAttributor && FinishedAtFixpoint && - ManifestChange == ChangeStatus::CHANGED) { - VerifyAttributor = false; - ChangeStatus VerifyStatus = run(); - if (VerifyStatus != ChangeStatus::UNCHANGED) - llvm_unreachable( - "Attributor verification failed, re-run did result in an IR change " - "even after a fixpoint was reached in the original run. (False " - "positives possible!)"); - VerifyAttributor = true; - } - NumAttributesManifested += NumManifested; NumAttributesValidFixpoint += NumAtFixpoint; + (void)NumFinalAAs; + assert( + NumFinalAAs == AllAbstractAttributes.size() && + "Expected the final number of abstract attributes to remain unchanged!"); + + // Delete stuff at the end to avoid invalid references and a nice order. 
+ { + LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least " + << ToBeDeletedFunctions.size() << " functions and " + << ToBeDeletedBlocks.size() << " blocks and " + << ToBeDeletedInsts.size() << " instructions and " + << ToBeChangedUses.size() << " uses\n"); + + SmallVector<Instruction *, 32> DeadInsts; + SmallVector<Instruction *, 32> TerminatorsToFold; + + for (auto &It : ToBeChangedUses) { + Use *U = It.first; + Value *NewV = It.second; + Value *OldV = U->get(); + LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser() + << " instead of " << *OldV << "\n"); + U->set(NewV); + if (Instruction *I = dyn_cast<Instruction>(OldV)) + if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) && + isInstructionTriviallyDead(I)) { + DeadInsts.push_back(I); + } + if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) { + Instruction *UserI = cast<Instruction>(U->getUser()); + if (isa<UndefValue>(NewV)) { + ToBeChangedToUnreachableInsts.insert(UserI); + } else { + TerminatorsToFold.push_back(UserI); + } + } + } + for (auto &V : InvokeWithDeadSuccessor) + if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) { + bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind); + bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn); + bool Invoke2CallAllowed = + !AAIsDeadFunction::mayCatchAsynchronousExceptions( + *II->getFunction()); + assert((UnwindBBIsDead || NormalBBIsDead) && + "Invoke does not have dead successors!"); + BasicBlock *BB = II->getParent(); + BasicBlock *NormalDestBB = II->getNormalDest(); + if (UnwindBBIsDead) { + Instruction *NormalNextIP = &NormalDestBB->front(); + if (Invoke2CallAllowed) { + changeToCall(II); + NormalNextIP = BB->getTerminator(); + } + if (NormalBBIsDead) + ToBeChangedToUnreachableInsts.insert(NormalNextIP); + } else { + assert(NormalBBIsDead && "Broken invariant!"); + if (!NormalDestBB->getUniquePredecessor()) + NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front()); + } + } + for (auto &V : ToBeChangedToUnreachableInsts) + if (Instruction *I = dyn_cast_or_null<Instruction>(V)) + changeToUnreachable(I, /* UseLLVMTrap */ false); + for (Instruction *I : TerminatorsToFold) + ConstantFoldTerminator(I->getParent()); + + for (Instruction *I : ToBeDeletedInsts) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + if (!isa<PHINode>(I) && isInstructionTriviallyDead(I)) + DeadInsts.push_back(I); + else + I->eraseFromParent(); + } + + RecursivelyDeleteTriviallyDeadInstructions(DeadInsts); + + if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) { + SmallVector<BasicBlock *, 8> ToBeDeletedBBs; + ToBeDeletedBBs.reserve(NumDeadBlocks); + ToBeDeletedBBs.append(ToBeDeletedBlocks.begin(), ToBeDeletedBlocks.end()); + // Actually we do not delete the blocks but squash them into a single + // unreachable but untangling branches that jump here is something we need + // to do in a more generic way. + DetatchDeadBlocks(ToBeDeletedBBs, nullptr); + STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted."); + BUILD_STAT_NAME(AAIsDead, BasicBlock) += ToBeDeletedBlocks.size(); + } + + // Identify dead internal functions and delete them. This happens outside + // the other fixpoint analysis as we might treat potentially dead functions + // as live to lower the number of iterations. If they happen to be dead, the + // below fixpoint loop will identify and eliminate them. 
+ SmallVector<Function *, 8> InternalFns; + for (Function &F : M) + if (F.hasLocalLinkage()) + InternalFns.push_back(&F); + + bool FoundDeadFn = true; + while (FoundDeadFn) { + FoundDeadFn = false; + for (unsigned u = 0, e = InternalFns.size(); u < e; ++u) { + Function *F = InternalFns[u]; + if (!F) + continue; + + if (!checkForAllCallSites( + [this](AbstractCallSite ACS) { + return ToBeDeletedFunctions.count( + ACS.getInstruction()->getFunction()); + }, + *F, true, nullptr)) + continue; + + ToBeDeletedFunctions.insert(F); + InternalFns[u] = nullptr; + FoundDeadFn = true; + } + } + } + + STATS_DECL(AAIsDead, Function, "Number of dead functions deleted."); + BUILD_STAT_NAME(AAIsDead, Function) += ToBeDeletedFunctions.size(); + + // Rewrite the functions as requested during manifest. + ManifestChange = ManifestChange | rewriteFunctionSignatures(); + + for (Function *Fn : ToBeDeletedFunctions) { + Fn->deleteBody(); + Fn->replaceAllUsesWith(UndefValue::get(Fn->getType())); + Fn->eraseFromParent(); + } + + if (VerifyMaxFixpointIterations && + IterationCounter != MaxFixpointIterations) { + errs() << "\n[Attributor] Fixpoint iteration done after: " + << IterationCounter << "/" << MaxFixpointIterations + << " iterations\n"; + llvm_unreachable("The fixpoint was not reached with exactly the number of " + "specified iterations!"); + } + return ManifestChange; } -void Attributor::identifyDefaultAbstractAttributes( - Function &F, InformationCache &InfoCache, - DenseSet</* Attribute::AttrKind */ unsigned> *Whitelist) { +bool Attributor::registerFunctionSignatureRewrite( + Argument &Arg, ArrayRef<Type *> ReplacementTypes, + ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB, + ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) { - // Every function can be nounwind. - registerAA(*new AANoUnwindFunction(F, InfoCache)); + auto CallSiteCanBeChanged = [](AbstractCallSite ACS) { + // Forbid must-tail calls for now. + return !ACS.isCallbackCall() && !ACS.getCallSite().isMustTailCall(); + }; - // Every function might be marked "nosync" - registerAA(*new AANoSyncFunction(F, InfoCache)); + Function *Fn = Arg.getParent(); + // Avoid var-arg functions for now. + if (Fn->isVarArg()) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n"); + return false; + } - // Every function might be "no-free". - registerAA(*new AANoFreeFunction(F, InfoCache)); + // Avoid functions with complicated argument passing semantics. + AttributeList FnAttributeList = Fn->getAttributes(); + if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) || + FnAttributeList.hasAttrSomewhere(Attribute::StructRet) || + FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) { + LLVM_DEBUG( + dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n"); + return false; + } - // Return attributes are only appropriate if the return type is non void. - Type *ReturnType = F.getReturnType(); - if (!ReturnType->isVoidTy()) { - // Argument attribute "returned" --- Create only one per function even - // though it is an argument attribute. - if (!Whitelist || Whitelist->count(AAReturnedValues::ID)) - registerAA(*new AAReturnedValuesImpl(F, InfoCache)); + // Avoid callbacks for now. 
+ if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr)) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n"); + return false; + } + + auto InstPred = [](Instruction &I) { + if (auto *CI = dyn_cast<CallInst>(&I)) + return !CI->isMustTailCall(); + return true; + }; - // Every function with pointer return type might be marked nonnull. - if (ReturnType->isPointerTy() && - (!Whitelist || Whitelist->count(AANonNullReturned::ID))) - registerAA(*new AANonNullReturned(F, InfoCache)); + // Forbid must-tail calls for now. + // TODO: + bool AnyDead; + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn); + if (!checkForAllInstructionsImpl(OpcodeInstMap, InstPred, nullptr, AnyDead, + {Instruction::Call})) { + LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n"); + return false; } - // Every argument with pointer type might be marked nonnull. - for (Argument &Arg : F.args()) { - if (Arg.getType()->isPointerTy()) - registerAA(*new AANonNullArgument(Arg, InfoCache)); + SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = ArgumentReplacementMap[Fn]; + if (ARIs.size() == 0) + ARIs.resize(Fn->arg_size()); + + // If we have a replacement already with less than or equal new arguments, + // ignore this request. + ArgumentReplacementInfo *&ARI = ARIs[Arg.getArgNo()]; + if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) { + LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n"); + return false; } - // Every function might be "will-return". - registerAA(*new AAWillReturnFunction(F, InfoCache)); + // If we have a replacement already but we like the new one better, delete + // the old. + if (ARI) + delete ARI; + + // Remember the replacement. + ARI = new ArgumentReplacementInfo(*this, Arg, ReplacementTypes, + std::move(CalleeRepairCB), + std::move(ACSRepairCB)); + + return true; +} + +ChangeStatus Attributor::rewriteFunctionSignatures() { + ChangeStatus Changed = ChangeStatus::UNCHANGED; + + for (auto &It : ArgumentReplacementMap) { + Function *OldFn = It.getFirst(); + + // Deleted functions do not require rewrites. + if (ToBeDeletedFunctions.count(OldFn)) + continue; + + const SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = It.getSecond(); + assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!"); + + SmallVector<Type *, 16> NewArgumentTypes; + SmallVector<AttributeSet, 16> NewArgumentAttributes; + + // Collect replacement argument types and copy over existing attributes. + AttributeList OldFnAttributeList = OldFn->getAttributes(); + for (Argument &Arg : OldFn->args()) { + if (ArgumentReplacementInfo *ARI = ARIs[Arg.getArgNo()]) { + NewArgumentTypes.append(ARI->ReplacementTypes.begin(), + ARI->ReplacementTypes.end()); + NewArgumentAttributes.append(ARI->getNumReplacementArgs(), + AttributeSet()); + } else { + NewArgumentTypes.push_back(Arg.getType()); + NewArgumentAttributes.push_back( + OldFnAttributeList.getParamAttributes(Arg.getArgNo())); + } + } + + FunctionType *OldFnTy = OldFn->getFunctionType(); + Type *RetTy = OldFnTy->getReturnType(); + + // Construct the new function type using the new arguments types. + FunctionType *NewFnTy = + FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg()); + + LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName() + << "' from " << *OldFn->getFunctionType() << " to " + << *NewFnTy << "\n"); + + // Create the new function body and insert it into the module. 
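// Illustrative sketch, not part of the patch: building the kind of expanded
// parameter list the rewrite above assembles from ReplacementTypes, here
// splitting every pointer parameter into two hypothetical i32 pieces.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DerivedTypes.h"

static llvm::FunctionType *expandPointerParams(llvm::LLVMContext &Ctx,
                                               llvm::FunctionType *OldTy) {
  using namespace llvm;
  SmallVector<Type *, 8> Params;
  for (Type *P : OldTy->params()) {
    if (P->isPointerTy()) {
      Params.push_back(Type::getInt32Ty(Ctx)); // invented replacement types
      Params.push_back(Type::getInt32Ty(Ctx));
    } else
      Params.push_back(P);
  }
  return FunctionType::get(OldTy->getReturnType(), Params, OldTy->isVarArg());
}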
+ Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(), + OldFn->getAddressSpace(), ""); + OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn); + NewFn->takeName(OldFn); + NewFn->copyAttributesFrom(OldFn); + + // Patch the pointer to LLVM function in debug info descriptor. + NewFn->setSubprogram(OldFn->getSubprogram()); + OldFn->setSubprogram(nullptr); + + // Recompute the parameter attributes list based on the new arguments for + // the function. + LLVMContext &Ctx = OldFn->getContext(); + NewFn->setAttributes(AttributeList::get( + Ctx, OldFnAttributeList.getFnAttributes(), + OldFnAttributeList.getRetAttributes(), NewArgumentAttributes)); + + // Since we have now created the new function, splice the body of the old + // function right into the new function, leaving the old rotting hulk of the + // function empty. + NewFn->getBasicBlockList().splice(NewFn->begin(), + OldFn->getBasicBlockList()); + + // Set of all "call-like" instructions that invoke the old function mapped + // to their new replacements. + SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs; + + // Callback to create a new "call-like" instruction for a given one. + auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) { + CallBase *OldCB = cast<CallBase>(ACS.getInstruction()); + const AttributeList &OldCallAttributeList = OldCB->getAttributes(); + + // Collect the new argument operands for the replacement call site. + SmallVector<Value *, 16> NewArgOperands; + SmallVector<AttributeSet, 16> NewArgOperandAttributes; + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) { + unsigned NewFirstArgNum = NewArgOperands.size(); + (void)NewFirstArgNum; // only used inside assert. + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->ACSRepairCB) + ARI->ACSRepairCB(*ARI, ACS, NewArgOperands); + assert(ARI->getNumReplacementArgs() + NewFirstArgNum == + NewArgOperands.size() && + "ACS repair callback did not provide as many operand as new " + "types were registered!"); + // TODO: Exose the attribute set to the ACS repair callback + NewArgOperandAttributes.append(ARI->ReplacementTypes.size(), + AttributeSet()); + } else { + NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum)); + NewArgOperandAttributes.push_back( + OldCallAttributeList.getParamAttributes(OldArgNum)); + } + } + + assert(NewArgOperands.size() == NewArgOperandAttributes.size() && + "Mismatch # argument operands vs. # argument operand attributes!"); + assert(NewArgOperands.size() == NewFn->arg_size() && + "Mismatch # argument operands vs. # function arguments!"); + + SmallVector<OperandBundleDef, 4> OperandBundleDefs; + OldCB->getOperandBundlesAsDefs(OperandBundleDefs); + + // Create a new call or invoke instruction to replace the old one. + CallBase *NewCB; + if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) { + NewCB = + InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(), + NewArgOperands, OperandBundleDefs, "", OldCB); + } else { + auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs, + "", OldCB); + NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind()); + NewCB = NewCI; + } + + // Copy over various properties and the new attributes. 
+ uint64_t W; + if (OldCB->extractProfTotalWeight(W)) + NewCB->setProfWeight(W); + NewCB->setCallingConv(OldCB->getCallingConv()); + NewCB->setDebugLoc(OldCB->getDebugLoc()); + NewCB->takeName(OldCB); + NewCB->setAttributes(AttributeList::get( + Ctx, OldCallAttributeList.getFnAttributes(), + OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes)); + + CallSitePairs.push_back({OldCB, NewCB}); + return true; + }; + + // Use the CallSiteReplacementCreator to create replacement call sites. + bool Success = + checkForAllCallSites(CallSiteReplacementCreator, *OldFn, true, nullptr); + (void)Success; + assert(Success && "Assumed call site replacement to succeed!"); + + // Rewire the arguments. + auto OldFnArgIt = OldFn->arg_begin(); + auto NewFnArgIt = NewFn->arg_begin(); + for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); + ++OldArgNum, ++OldFnArgIt) { + if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) { + if (ARI->CalleeRepairCB) + ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt); + NewFnArgIt += ARI->ReplacementTypes.size(); + } else { + NewFnArgIt->takeName(&*OldFnArgIt); + OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt); + ++NewFnArgIt; + } + } + + // Eliminate the instructions *after* we visited all of them. + for (auto &CallSitePair : CallSitePairs) { + CallBase &OldCB = *CallSitePair.first; + CallBase &NewCB = *CallSitePair.second; + OldCB.replaceAllUsesWith(&NewCB); + OldCB.eraseFromParent(); + } + + ToBeDeletedFunctions.insert(OldFn); + + Changed = ChangeStatus::CHANGED; + } + + return Changed; +} - // Walk all instructions to find more attribute opportunities and also - // interesting instructions that might be queried by abstract attributes - // during their initialization or update. +void Attributor::initializeInformationCache(Function &F) { + + // Walk all instructions to find interesting instructions that might be + // queried by abstract attributes during their initialization or update. + // This has to happen before we create attributes. auto &ReadOrWriteInsts = InfoCache.FuncRWInstsMap[&F]; auto &InstOpcodeMap = InfoCache.FuncInstOpcodeMap[&F]; @@ -1540,13 +6441,20 @@ void Attributor::identifyDefaultAbstractAttributes( default: assert((!ImmutableCallSite(&I)) && (!isa<CallBase>(&I)) && "New call site/base instruction type needs to be known int the " - "attributor."); + "Attributor."); break; + case Instruction::Load: + // The alignment of a pointer is interesting for loads. + case Instruction::Store: + // The alignment of a pointer is interesting for stores. 
case Instruction::Call: case Instruction::CallBr: case Instruction::Invoke: case Instruction::CleanupRet: case Instruction::CatchSwitch: + case Instruction::AtomicRMW: + case Instruction::AtomicCmpXchg: + case Instruction::Br: case Instruction::Resume: case Instruction::Ret: IsInterestingOpcode = true; @@ -1555,18 +6463,209 @@ void Attributor::identifyDefaultAbstractAttributes( InstOpcodeMap[I.getOpcode()].push_back(&I); if (I.mayReadOrWriteMemory()) ReadOrWriteInsts.push_back(&I); + } +} + +void Attributor::recordDependence(const AbstractAttribute &FromAA, + const AbstractAttribute &ToAA, + DepClassTy DepClass) { + if (FromAA.getState().isAtFixpoint()) + return; + + if (DepClass == DepClassTy::REQUIRED) + QueryMap[&FromAA].RequiredAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + else + QueryMap[&FromAA].OptionalAAs.insert( + const_cast<AbstractAttribute *>(&ToAA)); + QueriedNonFixAA = true; +} + +void Attributor::identifyDefaultAbstractAttributes(Function &F) { + if (!VisitedFunctions.insert(&F).second) + return; + if (F.isDeclaration()) + return; + + IRPosition FPos = IRPosition::function(F); + // Check for dead BasicBlocks in every function. + // We need dead instruction detection because we do not want to deal with + // broken IR in which SSA rules do not apply. + getOrCreateAAFor<AAIsDead>(FPos); + + // Every function might be "will-return". + getOrCreateAAFor<AAWillReturn>(FPos); + + // Every function might contain instructions that cause "undefined behavior". + getOrCreateAAFor<AAUndefinedBehavior>(FPos); + + // Every function can be nounwind. + getOrCreateAAFor<AANoUnwind>(FPos); + + // Every function might be marked "nosync" + getOrCreateAAFor<AANoSync>(FPos); + + // Every function might be "no-free". + getOrCreateAAFor<AANoFree>(FPos); + + // Every function might be "no-return". + getOrCreateAAFor<AANoReturn>(FPos); + + // Every function might be "no-recurse". + getOrCreateAAFor<AANoRecurse>(FPos); + + // Every function might be "readnone/readonly/writeonly/...". + getOrCreateAAFor<AAMemoryBehavior>(FPos); + + // Every function might be applicable for Heap-To-Stack conversion. + if (EnableHeapToStack) + getOrCreateAAFor<AAHeapToStack>(FPos); + + // Return attributes are only appropriate if the return type is non void. + Type *ReturnType = F.getReturnType(); + if (!ReturnType->isVoidTy()) { + // Argument attribute "returned" --- Create only one per function even + // though it is an argument attribute. + getOrCreateAAFor<AAReturnedValues>(FPos); + + IRPosition RetPos = IRPosition::returned(F); + + // Every returned value might be dead. + getOrCreateAAFor<AAIsDead>(RetPos); + + // Every function might be simplified. + getOrCreateAAFor<AAValueSimplify>(RetPos); + + if (ReturnType->isPointerTy()) { + + // Every function with pointer return type might be marked align. + getOrCreateAAFor<AAAlign>(RetPos); + + // Every function with pointer return type might be marked nonnull. + getOrCreateAAFor<AANonNull>(RetPos); + + // Every function with pointer return type might be marked noalias. + getOrCreateAAFor<AANoAlias>(RetPos); + + // Every function with pointer return type might be marked + // dereferenceable. + getOrCreateAAFor<AADereferenceable>(RetPos); + } + } + + for (Argument &Arg : F.args()) { + IRPosition ArgPos = IRPosition::argument(Arg); + + // Every argument might be simplified. + getOrCreateAAFor<AAValueSimplify>(ArgPos); + + if (Arg.getType()->isPointerTy()) { + // Every argument with pointer type might be marked nonnull. 
+ getOrCreateAAFor<AANonNull>(ArgPos); + + // Every argument with pointer type might be marked noalias. + getOrCreateAAFor<AANoAlias>(ArgPos); + + // Every argument with pointer type might be marked dereferenceable. + getOrCreateAAFor<AADereferenceable>(ArgPos); + + // Every argument with pointer type might be marked align. + getOrCreateAAFor<AAAlign>(ArgPos); + + // Every argument with pointer type might be marked nocapture. + getOrCreateAAFor<AANoCapture>(ArgPos); + + // Every argument with pointer type might be marked + // "readnone/readonly/writeonly/..." + getOrCreateAAFor<AAMemoryBehavior>(ArgPos); + + // Every argument with pointer type might be marked nofree. + getOrCreateAAFor<AANoFree>(ArgPos); + } + } + + auto CallSitePred = [&](Instruction &I) -> bool { CallSite CS(&I); - if (CS && CS.getCalledFunction()) { - for (int i = 0, e = CS.getCalledFunction()->arg_size(); i < e; i++) { + if (Function *Callee = CS.getCalledFunction()) { + // Skip declerations except if annotations on their call sites were + // explicitly requested. + if (!AnnotateDeclarationCallSites && Callee->isDeclaration() && + !Callee->hasMetadata(LLVMContext::MD_callback)) + return true; + + if (!Callee->getReturnType()->isVoidTy() && !CS->use_empty()) { + + IRPosition CSRetPos = IRPosition::callsite_returned(CS); + + // Call site return values might be dead. + getOrCreateAAFor<AAIsDead>(CSRetPos); + + // Call site return integer values might be limited by a constant range. + if (Callee->getReturnType()->isIntegerTy()) { + getOrCreateAAFor<AAValueConstantRange>(CSRetPos); + } + } + + for (int i = 0, e = CS.getNumArgOperands(); i < e; i++) { + + IRPosition CSArgPos = IRPosition::callsite_argument(CS, i); + + // Every call site argument might be dead. + getOrCreateAAFor<AAIsDead>(CSArgPos); + + // Call site argument might be simplified. + getOrCreateAAFor<AAValueSimplify>(CSArgPos); + if (!CS.getArgument(i)->getType()->isPointerTy()) continue; // Call site argument attribute "non-null". - registerAA(*new AANonNullCallSiteArgument(CS, i, InfoCache), i); + getOrCreateAAFor<AANonNull>(CSArgPos); + + // Call site argument attribute "no-alias". + getOrCreateAAFor<AANoAlias>(CSArgPos); + + // Call site argument attribute "dereferenceable". + getOrCreateAAFor<AADereferenceable>(CSArgPos); + + // Call site argument attribute "align". + getOrCreateAAFor<AAAlign>(CSArgPos); + + // Call site argument attribute + // "readnone/readonly/writeonly/..." + getOrCreateAAFor<AAMemoryBehavior>(CSArgPos); + + // Call site argument attribute "nofree". 
+ getOrCreateAAFor<AANoFree>(CSArgPos); } } - } + return true; + }; + + auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F); + bool Success, AnyDead = false; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, CallSitePred, nullptr, AnyDead, + {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, + (unsigned)Instruction::Call}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); + + auto LoadStorePred = [&](Instruction &I) -> bool { + if (isa<LoadInst>(I)) + getOrCreateAAFor<AAAlign>( + IRPosition::value(*cast<LoadInst>(I).getPointerOperand())); + else + getOrCreateAAFor<AAAlign>( + IRPosition::value(*cast<StoreInst>(I).getPointerOperand())); + return true; + }; + Success = checkForAllInstructionsImpl( + OpcodeInstMap, LoadStorePred, nullptr, AnyDead, + {(unsigned)Instruction::Load, (unsigned)Instruction::Store}); + (void)Success; + assert(Success && !AnyDead && "Expected the check call to be successful!"); } /// Helpers to ease debugging through output streams and print calls. @@ -1576,21 +6675,52 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) { return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged"); } -raw_ostream &llvm::operator<<(raw_ostream &OS, - AbstractAttribute::ManifestPosition AP) { +raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) { switch (AP) { - case AbstractAttribute::MP_ARGUMENT: + case IRPosition::IRP_INVALID: + return OS << "inv"; + case IRPosition::IRP_FLOAT: + return OS << "flt"; + case IRPosition::IRP_RETURNED: + return OS << "fn_ret"; + case IRPosition::IRP_CALL_SITE_RETURNED: + return OS << "cs_ret"; + case IRPosition::IRP_FUNCTION: + return OS << "fn"; + case IRPosition::IRP_CALL_SITE: + return OS << "cs"; + case IRPosition::IRP_ARGUMENT: return OS << "arg"; - case AbstractAttribute::MP_CALL_SITE_ARGUMENT: + case IRPosition::IRP_CALL_SITE_ARGUMENT: return OS << "cs_arg"; - case AbstractAttribute::MP_FUNCTION: - return OS << "fn"; - case AbstractAttribute::MP_RETURNED: - return OS << "fn_ret"; } llvm_unreachable("Unknown attribute position!"); } +raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) { + const Value &AV = Pos.getAssociatedValue(); + return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " [" + << Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}"; +} + +template <typename base_ty, base_ty BestState, base_ty WorstState> +raw_ostream & +llvm::operator<<(raw_ostream &OS, + const IntegerStateBase<base_ty, BestState, WorstState> &S) { + return OS << "(" << S.getKnown() << "-" << S.getAssumed() << ")" + << static_cast<const AbstractState &>(S); +} + +raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) { + OS << "range-state(" << S.getBitWidth() << ")<"; + S.getKnown().print(OS); + OS << " / "; + S.getAssumed().print(OS); + OS << ">"; + + return OS << static_cast<const AbstractState &>(S); +} + raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) { return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? 
"fix" : "")); } @@ -1601,8 +6731,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) { } void AbstractAttribute::print(raw_ostream &OS) const { - OS << "[" << getManifestPosition() << "][" << getAsStr() << "][" - << AnchoredVal.getName() << "]"; + OS << "[P: " << getIRPosition() << "][" << getAsStr() << "][S: " << getState() + << "]"; } ///} @@ -1610,7 +6740,7 @@ void AbstractAttribute::print(raw_ostream &OS) const { /// Pass (Manager) Boilerplate /// ---------------------------------------------------------------------------- -static bool runAttributorOnModule(Module &M) { +static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { if (DisableAttributor) return false; @@ -1619,39 +6749,41 @@ static bool runAttributorOnModule(Module &M) { // Create an Attributor and initially empty information cache that is filled // while we identify default attribute opportunities. - Attributor A; - InformationCache InfoCache; + InformationCache InfoCache(M, AG); + Attributor A(InfoCache, DepRecInterval); + + for (Function &F : M) + A.initializeInformationCache(F); for (Function &F : M) { - // TODO: Not all attributes require an exact definition. Find a way to - // enable deduction for some but not all attributes in case the - // definition might be changed at runtime, see also - // http://lists.llvm.org/pipermail/llvm-dev/2018-February/121275.html. - // TODO: We could always determine abstract attributes and if sufficient - // information was found we could duplicate the functions that do not - // have an exact definition. - if (!F.hasExactDefinition()) { + if (F.hasExactDefinition()) + NumFnWithExactDefinition++; + else NumFnWithoutExactDefinition++; - continue; - } - - // For now we ignore naked and optnone functions. - if (F.hasFnAttribute(Attribute::Naked) || - F.hasFnAttribute(Attribute::OptimizeNone)) - continue; - NumFnWithExactDefinition++; + // We look at internal functions only on-demand but if any use is not a + // direct call, we have to do it eagerly. + if (F.hasLocalLinkage()) { + if (llvm::all_of(F.uses(), [](const Use &U) { + return ImmutableCallSite(U.getUser()) && + ImmutableCallSite(U.getUser()).isCallee(&U); + })) + continue; + } // Populate the Attributor with abstract attribute opportunities in the // function and the information cache with IR information. - A.identifyDefaultAbstractAttributes(F, InfoCache); + A.identifyDefaultAbstractAttributes(F); } - return A.run() == ChangeStatus::CHANGED; + bool Changed = A.run(M) == ChangeStatus::CHANGED; + assert(!verifyModule(M, &errs()) && "Module verification failed!"); + return Changed; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { - if (runAttributorOnModule(M)) { + AnalysisGetter AG(AM); + if (runAttributorOnModule(M, AG)) { // FIXME: Think about passes we will preserve and add them here. return PreservedAnalyses::none(); } @@ -1670,12 +6802,14 @@ struct AttributorLegacyPass : public ModulePass { bool runOnModule(Module &M) override { if (skipModule(M)) return false; - return runAttributorOnModule(M); + + AnalysisGetter AG; + return runAttributorOnModule(M, AG); } void getAnalysisUsage(AnalysisUsage &AU) const override { // FIXME: Think about passes we will preserve and add them here. 
- AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); } }; @@ -1684,7 +6818,153 @@ struct AttributorLegacyPass : public ModulePass { Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); } char AttributorLegacyPass::ID = 0; + +const char AAReturnedValues::ID = 0; +const char AANoUnwind::ID = 0; +const char AANoSync::ID = 0; +const char AANoFree::ID = 0; +const char AANonNull::ID = 0; +const char AANoRecurse::ID = 0; +const char AAWillReturn::ID = 0; +const char AAUndefinedBehavior::ID = 0; +const char AANoAlias::ID = 0; +const char AAReachability::ID = 0; +const char AANoReturn::ID = 0; +const char AAIsDead::ID = 0; +const char AADereferenceable::ID = 0; +const char AAAlign::ID = 0; +const char AANoCapture::ID = 0; +const char AAValueSimplify::ID = 0; +const char AAHeapToStack::ID = 0; +const char AAMemoryBehavior::ID = 0; +const char AAValueConstantRange::ID = 0; + +// Macro magic to create the static generator function for attributes that +// follow the naming scheme. + +#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \ + case IRPosition::PK: \ + llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!"); + +#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \ + case IRPosition::PK: \ + AA = new CLASS##SUFFIX(IRP); \ + break; + +#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + } \ + return *AA; \ + } + +#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS 
*AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \ + SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \ + SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + } \ + return *AA; \ + } + +#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \ + CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \ + CLASS *AA = nullptr; \ + switch (IRP.getPositionKind()) { \ + SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \ + SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \ + SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \ + } \ + return *AA; \ + } + +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn) +CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues) + +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange) + +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) +CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree) + +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability) +CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior) + +CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior) + +#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION +#undef SWITCH_PK_CREATE +#undef SWITCH_PK_INV + INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) INITIALIZE_PASS_END(AttributorLegacyPass, "attributor", "Deduce and propagate attributes", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp index 6b68aa90c567..b49a92ad16b3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BarrierNoopPass.cpp @@ -17,6 +17,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using 
namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp index 6c365f3f3cbe..aec470ffadc4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/BlockExtractor.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -119,6 +120,8 @@ void BlockExtractor::loadFile() { /*KeepEmpty=*/false); if (LineSplit.empty()) continue; + if (LineSplit.size()!=2) + report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'"); SmallVector<StringRef, 4> BBNames; LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1, /*KeepEmpty=*/false); @@ -204,7 +207,8 @@ bool BlockExtractor::runOnModule(Module &M) { ++NumExtracted; Changed = true; } - Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(); + CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent()); + Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC); if (F) LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName() << "' in: " << F->getName() << '\n'); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp index 20cb3213628e..f28a399b1779 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp @@ -21,6 +21,8 @@ #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp index ad877ae1786c..ea1278aa108f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ConstantMerge.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" @@ -48,7 +49,7 @@ static void FindUsedValues(GlobalVariable *LLVMUsed, ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { - Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + Value *Operand = Inits->getOperand(i)->stripPointerCasts(); GlobalValue *GV = cast<GlobalValue>(Operand); UsedValues.insert(GV); } @@ -120,7 +121,7 @@ static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) { // Bump the alignment if necessary. 
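// Illustrative sketch, not part of the patch: setAlignment() now takes the
// llvm::Align wrapper, which must hold a non-zero power of two, hence the
// guard in the surrounding code before the maximum is wrapped.
#include "llvm/Support/Alignment.h"
#include <algorithm>

static llvm::Align pickLargerAlign(unsigned A, unsigned B) {
  return llvm::Align(std::max(A, B)); // asserts unless the value is 1,2,4,...
}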
if (Old->getAlignment() || New->getAlignment()) - New->setAlignment(std::max(getAlignment(Old), getAlignment(New))); + New->setAlignment(Align(std::max(getAlignment(Old), getAlignment(New)))); copyDebugLocMetadata(Old, New); Old->replaceAllUsesWith(NewConstant); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index e30b33aa4872..2fe9a59ad210 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -84,13 +85,9 @@ void CrossDSOCFI::buildCFICheck(Module &M) { for (GlobalObject &GO : M.global_objects()) { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - for (MDNode *Type : Types) { - // Sanity check. GO must not be a function declaration. - assert(!isa<Function>(&GO) || !cast<Function>(&GO)->isDeclaration()); - + for (MDNode *Type : Types) if (ConstantInt *TypeId = extractNumericTypeId(Type)) TypeIds.insert(TypeId->getZExtValue()); - } } NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions"); @@ -108,11 +105,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) { FunctionCallee C = M.getOrInsertFunction( "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx), Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx)); - Function *F = dyn_cast<Function>(C.getCallee()); + Function *F = cast<Function>(C.getCallee()); // Take over the existing function. The frontend emits a weak stub so that the // linker knows about the symbol; this pass replaces the function body. 
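The dyn_cast<Function> to cast<Function> switch just above is a fix rather than style: the result feeds straight into the deleteBody() call that follows, so a null return could only crash later anyway, and cast<> turns that latent null dereference into an immediate assertion. A hedged sketch of the takeover idiom (takeOverCfiCheck is a hypothetical name):

#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Sketch: take over a weak stub the frontend emitted, so the linker already
// knows the symbol. cast<> asserts that the callee really is a Function.
static Function *takeOverCfiCheck(Module &M) {
  LLVMContext &Ctx = M.getContext();
  FunctionCallee C = M.getOrInsertFunction(
      "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
      Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
  Function *F = cast<Function>(C.getCallee());
  F->deleteBody();              // drop the stub body before rebuilding it
  F->setAlignment(Align(4096)); // matches the alignment set in the hunk below
  return F;
}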
F->deleteBody(); - F->setAlignment(4096); + F->setAlignment(Align(4096)); Triple T(M.getTargetTriple()); if (T.isARM() || T.isThumb()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index 968a13110b16..61d519d8ae88 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index fc52db562c62..7f138d206fac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/GlobalStatus.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp index b38cb6d0ed3f..2cb184e8d4f4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp @@ -11,6 +11,8 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 5ccd8bc4b0fb..b6d0b2e35694 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -48,6 +48,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -78,11 +79,8 @@ STATISTIC(NumNoRecurse, "Number of functions marked as norecurse"); STATISTIC(NumNoUnwind, "Number of functions marked as nounwind"); STATISTIC(NumNoFree, "Number of functions marked as nofree"); -// FIXME: This is disabled by default to avoid exposing security vulnerabilities -// in C/C++ code compiled by clang: -// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html static cl::opt<bool> EnableNonnullArgPropagation( - "enable-nonnull-arg-prop", cl::Hidden, + "enable-nonnull-arg-prop", cl::init(true), cl::Hidden, cl::desc("Try to propagate nonnull argument attributes from callsites to " "caller functions.")); @@ -664,6 +662,25 @@ static bool addArgumentAttrsFromCallsites(Function &F) { return Changed; } +static bool addReadAttr(Argument *A, Attribute::AttrKind R) { + assert((R == Attribute::ReadOnly || R == Attribute::ReadNone) + && "Must be a Read attribute."); + assert(A && "Argument must not be null."); + + // If the argument already has the attribute, nothing needs to be done. 
+ if (A->hasAttribute(R)) + return false; + + // Otherwise, remove potentially conflicting attribute, add the new one, + // and update statistics. + A->removeAttr(Attribute::WriteOnly); + A->removeAttr(Attribute::ReadOnly); + A->removeAttr(Attribute::ReadNone); + A->addAttr(R); + R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; + return true; +} + /// Deduce nocapture attributes for the SCC. static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { bool Changed = false; @@ -732,11 +749,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { SmallPtrSet<Argument *, 8> Self; Self.insert(&*A); Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self); - if (R != Attribute::None) { - A->addAttr(R); - Changed = true; - R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - } + if (R != Attribute::None) + Changed = addReadAttr(A, R); } } } @@ -833,12 +847,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) { if (ReadAttr != Attribute::None) { for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) { Argument *A = ArgumentSCC[i]->Definition; - // Clear out existing readonly/readnone attributes - A->removeAttr(Attribute::ReadOnly); - A->removeAttr(Attribute::ReadNone); - A->addAttr(ReadAttr); - ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg; - Changed = true; + Changed = addReadAttr(A, ReadAttr); } } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp index 62c7fbd07223..be0446a946ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" #include "llvm/Linker/IRMover.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Object/SymbolicFile.h" @@ -231,7 +232,8 @@ selectCallee(const ModuleSummaryIndex &Index, return false; } - if (Summary->instCount() > Threshold) { + if ((Summary->instCount() > Threshold) && + !Summary->fflags().AlwaysInline) { Reason = FunctionImporter::ImportFailureReason::TooLarge; return false; } @@ -280,7 +282,8 @@ updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) { } static void computeImportForReferencedGlobals( - const FunctionSummary &Summary, const GVSummaryMapTy &DefinedGVSummaries, + const FunctionSummary &Summary, const ModuleSummaryIndex &Index, + const GVSummaryMapTy &DefinedGVSummaries, FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists) { for (auto &VI : Summary.refs()) { @@ -303,16 +306,28 @@ static void computeImportForReferencedGlobals( RefSummary->modulePath() != Summary.modulePath(); }; + auto MarkExported = [&](const ValueInfo &VI, const GlobalValueSummary *S) { + if (ExportLists) + (*ExportLists)[S->modulePath()].insert(VI); + }; + for (auto &RefSummary : VI.getSummaryList()) if (isa<GlobalVarSummary>(RefSummary.get()) && - canImportGlobalVar(RefSummary.get()) && + Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) && !LocalNotInModule(RefSummary.get())) { auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID()); // Only update stat if we haven't already imported this variable. 
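Two FunctionImport changes here work as a pair: selectCallee above now waives the instruction-count threshold for callees whose summary carries the AlwaysInline flag (refusing to import them could leave a mandatory inline site without a definition to inline), and the assert later in computeImportForFunction is relaxed to accept the same exemption. The admission rule, sketched:

#include "llvm/IR/ModuleSummaryIndex.h"

// Sketch: size alone only disqualifies a candidate that is not marked
// always_inline in its summary flags.
static bool rejectedForSize(const llvm::FunctionSummary &S,
                            unsigned Threshold) {
  return S.instCount() > Threshold && !S.fflags().AlwaysInline;
}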
if (ILI.second) NumImportedGlobalVarsThinLink++; - if (ExportLists) - (*ExportLists)[RefSummary->modulePath()].insert(VI.getGUID()); + MarkExported(VI, RefSummary.get()); + // Promote referenced functions and variables. We don't promote + // objects referenced by writeonly variable initializer, because + // we convert such variables initializers to "zeroinitializer". + // See processGlobalForThinLTO. + if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get()))) + for (const auto &VI : RefSummary->refs()) + for (const auto &RefFn : VI.getSummaryList()) + MarkExported(VI, RefFn.get()); break; } } @@ -351,8 +366,8 @@ static void computeImportForFunction( FunctionImporter::ImportMapTy &ImportList, StringMap<FunctionImporter::ExportSetTy> *ExportLists, FunctionImporter::ImportThresholdsTy &ImportThresholds) { - computeImportForReferencedGlobals(Summary, DefinedGVSummaries, ImportList, - ExportLists); + computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries, + ImportList, ExportLists); static int ImportCount = 0; for (auto &Edge : Summary.calls()) { ValueInfo VI = Edge.first; @@ -450,7 +465,7 @@ static void computeImportForFunction( } else if (PrintImportFailures) { assert(!FailureInfo && "Expected no FailureInfo for newly rejected candidate"); - FailureInfo = llvm::make_unique<FunctionImporter::ImportFailureInfo>( + FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>( VI, Edge.second.getHotness(), Reason, 1); } LLVM_DEBUG( @@ -462,7 +477,8 @@ static void computeImportForFunction( CalleeSummary = CalleeSummary->getBaseObject(); ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary); - assert(ResolvedCalleeSummary->instCount() <= NewThreshold && + assert((ResolvedCalleeSummary->fflags().AlwaysInline || + (ResolvedCalleeSummary->instCount() <= NewThreshold)) && "selectCallee() didn't honor the threshold"); auto ExportModulePath = ResolvedCalleeSummary->modulePath(); @@ -481,7 +497,7 @@ static void computeImportForFunction( // Make exports in the source module. if (ExportLists) { auto &ExportList = (*ExportLists)[ExportModulePath]; - ExportList.insert(VI.getGUID()); + ExportList.insert(VI); if (!PreviouslyImported) { // This is the first time this function was exported from its source // module, so mark all functions and globals it references as exported @@ -489,14 +505,11 @@ static void computeImportForFunction( // For efficiency, we unconditionally add all the referenced GUIDs // to the ExportList for this module, and will prune out any not // defined in the module later in a single pass. - for (auto &Edge : ResolvedCalleeSummary->calls()) { - auto CalleeGUID = Edge.first.getGUID(); - ExportList.insert(CalleeGUID); - } - for (auto &Ref : ResolvedCalleeSummary->refs()) { - auto GUID = Ref.getGUID(); - ExportList.insert(GUID); - } + for (auto &Edge : ResolvedCalleeSummary->calls()) + ExportList.insert(Edge.first); + + for (auto &Ref : ResolvedCalleeSummary->refs()) + ExportList.insert(Ref); } } } @@ -591,29 +604,64 @@ static void ComputeImportForModule( } #ifndef NDEBUG +static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) { + auto SL = VI.getSummaryList(); + return SL.empty() + ? 
false + : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; +} + static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, GlobalValue::GUID G) { - if (const auto &VI = Index.getValueInfo(G)) { - auto SL = VI.getSummaryList(); - if (!SL.empty()) - return SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind; - } + if (const auto &VI = Index.getValueInfo(G)) + return isGlobalVarSummary(Index, VI); return false; } -static GlobalValue::GUID getGUID(GlobalValue::GUID G) { return G; } - template <class T> static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index, T &Cont) { unsigned NumGVS = 0; for (auto &V : Cont) - if (isGlobalVarSummary(Index, getGUID(V))) + if (isGlobalVarSummary(Index, V)) ++NumGVS; return NumGVS; } #endif +#ifndef NDEBUG +static bool +checkVariableImport(const ModuleSummaryIndex &Index, + StringMap<FunctionImporter::ImportMapTy> &ImportLists, + StringMap<FunctionImporter::ExportSetTy> &ExportLists) { + + DenseSet<GlobalValue::GUID> FlattenedImports; + + for (auto &ImportPerModule : ImportLists) + for (auto &ExportPerModule : ImportPerModule.second) + FlattenedImports.insert(ExportPerModule.second.begin(), + ExportPerModule.second.end()); + + // Checks that all GUIDs of read/writeonly vars we see in export lists + // are also in the import lists. Otherwise we may face linker undefs, + // because readonly and writeonly vars are internalized in their + // source modules. + auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) { + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + Index.findSummaryInModule(VI, ModulePath)); + return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS)); + }; + + for (auto &ExportPerModule : ExportLists) + for (auto &VI : ExportPerModule.second) + if (!FlattenedImports.count(VI.getGUID()) && + IsReadOrWriteOnlyVar(ExportPerModule.first(), VI)) + return false; + + return true; +} +#endif + /// Compute all the import and export for every module using the Index. void llvm::ComputeCrossModuleImport( const ModuleSummaryIndex &Index, @@ -639,13 +687,14 @@ const auto &DefinedGVSummaries = ModuleToDefinedGVSummaries.lookup(ELI.first()); for (auto EI = ELI.second.begin(); EI != ELI.second.end();) { - if (!DefinedGVSummaries.count(*EI)) - EI = ELI.second.erase(EI); + if (!DefinedGVSummaries.count(EI->getGUID())) + ELI.second.erase(EI++); else ++EI; } } + assert(checkVariableImport(Index, ImportLists, ExportLists)); #ifndef NDEBUG LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size() << " modules:\n"); @@ -764,7 +813,7 @@ void llvm::computeDeadSymbols( } // Make value live and add it to the worklist if it was not live before. - auto visit = [&](ValueInfo VI) { + auto visit = [&](ValueInfo VI, bool IsAliasee) { // FIXME: If we knew which edges were created for indirect call profiles, // we could skip them here. Any that are live should be reached via // other edges, e.g. reference edges.
Otherwise, using a profile collected @@ -800,12 +849,15 @@ void llvm::computeDeadSymbols( Interposable = true; } - if (!KeepAliveLinkage) - return; + if (!IsAliasee) { + if (!KeepAliveLinkage) + return; - if (Interposable) - report_fatal_error( - "Interposable and available_externally/linkonce_odr/weak_odr symbol"); + if (Interposable) + report_fatal_error( + "Interposable and available_externally/linkonce_odr/weak_odr " + "symbol"); + } } for (auto &S : VI.getSummaryList()) @@ -821,16 +873,16 @@ void llvm::computeDeadSymbols( // If this is an alias, visit the aliasee VI to ensure that all copies // are marked live and it is added to the worklist for further // processing of its references. - visit(AS->getAliaseeVI()); + visit(AS->getAliaseeVI(), true); continue; } Summary->setLive(true); for (auto Ref : Summary->refs()) - visit(Ref); + visit(Ref, false); if (auto *FS = dyn_cast<FunctionSummary>(Summary.get())) for (auto Call : FS->calls()) - visit(Call.first); + visit(Call.first, false); } } Index.setWithGlobalValueDeadStripping(); @@ -849,18 +901,8 @@ void llvm::computeDeadSymbolsWithConstProp( function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing, bool ImportEnabled) { computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing); - if (ImportEnabled) { + if (ImportEnabled) Index.propagateAttributes(GUIDPreservedSymbols); - } else { - // If import is disabled we should drop read/write-only attribute - // from all summaries to prevent internalization. - for (auto &P : Index) - for (auto &S : P.second.SummaryList) - if (auto *GVS = dyn_cast<GlobalVarSummary>(S.get())) { - GVS->setReadOnly(false); - GVS->setWriteOnly(false); - } - } } /// Compute the set of summaries needed for a ThinLTO backend compilation of @@ -892,7 +934,7 @@ std::error_code llvm::EmitImportsFiles( StringRef ModulePath, StringRef OutputFilename, const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) { std::error_code EC; - raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::F_None); + raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None); if (EC) return EC; for (auto &ILI : ModuleToSummariesForIndex) @@ -948,23 +990,15 @@ void llvm::thinLTOResolvePrevailingInModule( auto NewLinkage = GS->second->linkage(); if (NewLinkage == GV.getLinkage()) return; - - // Switch the linkage to weakany if asked for, e.g. we do this for - // linker redefined symbols (via --wrap or --defsym). - // We record that the visibility should be changed here in `addThinLTO` - // as we need access to the resolution vectors for each input file in - // order to find which symbols have been redefined. - // We may consider reorganizing this code and moving the linkage recording - // somewhere else, e.g. in thinLTOResolvePrevailingInIndex. - if (NewLinkage == GlobalValue::WeakAnyLinkage) { - GV.setLinkage(NewLinkage); - return; - } - if (GlobalValue::isLocalLinkage(GV.getLinkage()) || + // Don't internalize anything here, because the code below + // lacks necessary correctness checks. Leave this job to + // LLVM 'internalize' pass. + GlobalValue::isLocalLinkage(NewLinkage) || // In case it was dead and already converted to declaration. GV.isDeclaration()) return; + // Check for a non-prevailing def that has interposable linkage // (e.g. non-odr weak or linkonce). 
In that case we can't simply // convert to available_externally, since it would lose the diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp index 86b7f3e49ee6..72b8d7522f04 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -17,10 +17,14 @@ #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TypeMetadataUtils.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -29,10 +33,15 @@ using namespace llvm; #define DEBUG_TYPE "globaldce" +static cl::opt<bool> + ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore, + cl::desc("Enable virtual function elimination")); + STATISTIC(NumAliases , "Number of global aliases removed"); STATISTIC(NumFunctions, "Number of functions removed"); STATISTIC(NumIFuncs, "Number of indirect functions removed"); STATISTIC(NumVariables, "Number of global variables removed"); +STATISTIC(NumVFuncs, "Number of virtual functions removed"); namespace { class GlobalDCELegacyPass : public ModulePass { @@ -118,6 +127,15 @@ void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) { ComputeDependencies(User, Deps); Deps.erase(&GV); // Remove self-reference. for (GlobalValue *GVU : Deps) { + // If this is a dep from a vtable to a virtual function, and we have + // complete information about all virtual call sites which could call + // through this vtable, then skip it, because the call site information will + // be more precise. + if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) { + LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> " + << GV.getName() << "\n"); + continue; + } GVDependencies[GVU].insert(&GV); } } @@ -132,12 +150,133 @@ void GlobalDCEPass::MarkLive(GlobalValue &GV, if (Updates) Updates->push_back(&GV); if (Comdat *C = GV.getComdat()) { - for (auto &&CM : make_range(ComdatMembers.equal_range(C))) + for (auto &&CM : make_range(ComdatMembers.equal_range(C))) { MarkLive(*CM.second, Updates); // Recursion depth is only two because only // globals in the same comdat are visited. + } + } +} + +void GlobalDCEPass::ScanVTables(Module &M) { + SmallVector<MDNode *, 2> Types; + LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n"); + + auto *LTOPostLinkMD = + cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink")); + bool LTOPostLink = + LTOPostLinkMD && + (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0); + + for (GlobalVariable &GV : M.globals()) { + Types.clear(); + GV.getMetadata(LLVMContext::MD_type, Types); + if (GV.isDeclaration() || Types.empty()) + continue; + + // Use the typeid metadata on the vtable to build a mapping from typeids to + // the list of (GV, offset) pairs which are the possible vtables for that + // typeid.
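Whether a vtable participates in virtual function elimination at all hinges on its virtual-call visibility: translation-unit visibility always qualifies, and linkage-unit visibility qualifies once the LTOPostLink module flag shows that every linkage-unit user has been merged in. The rule applied a few lines below in ScanVTables, condensed into a sketch (isVFESafe is a hypothetical name):

#include "llvm/IR/GlobalObject.h"

// Hypothetical predicate condensing the eligibility check in ScanVTables.
static bool isVFESafe(const llvm::GlobalObject &GO, bool LTOPostLink) {
  auto Vis = GO.getVCallVisibility();
  return Vis == llvm::GlobalObject::VCallVisibilityTranslationUnit ||
         (LTOPostLink &&
          Vis == llvm::GlobalObject::VCallVisibilityLinkageUnit);
}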
+ for (MDNode *Type : Types) { + Metadata *TypeID = Type->getOperand(1).get(); + + uint64_t Offset = + cast<ConstantInt>( + cast<ConstantAsMetadata>(Type->getOperand(0))->getValue()) + ->getZExtValue(); + + TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset)); + } + + // If the type corresponding to the vtable is private to this translation + // unit, we know that we can see all virtual functions which might use it, + // so VFE is safe. + if (auto GO = dyn_cast<GlobalObject>(&GV)) { + GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility(); + if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit || + (LTOPostLink && + TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) { + LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n"); + VFESafeVTables.insert(&GV); + } + } + } +} + +void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, + uint64_t CallOffset) { + for (auto &VTableInfo : TypeIdMap[TypeId]) { + GlobalVariable *VTable = VTableInfo.first; + uint64_t VTableOffset = VTableInfo.second; + + Constant *Ptr = + getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset, + *Caller->getParent()); + if (!Ptr) { + LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n"); + VFESafeVTables.erase(VTable); + return; + } + + auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts()); + if (!Callee) { + LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n"); + VFESafeVTables.erase(VTable); + return; + } + + LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> " + << Callee->getName() << "\n"); + GVDependencies[Caller].insert(Callee); } } +void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) { + LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n"); + Function *TypeCheckedLoadFunc = + M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); + + if (!TypeCheckedLoadFunc) + return; + + for (auto U : TypeCheckedLoadFunc->users()) { + auto CI = dyn_cast<CallInst>(U); + if (!CI) + continue; + + auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + Value *TypeIdValue = CI->getArgOperand(2); + auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata(); + + if (Offset) { + ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue()); + } else { + // type.checked.load with a non-constant offset, so assume every entry in + // every matching vtable is used. + for (auto &VTableInfo : TypeIdMap[TypeId]) { + VFESafeVTables.erase(VTableInfo.first); + } + } + } +} + +void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) { + if (!ClEnableVFE) + return; + + ScanVTables(M); + + if (VFESafeVTables.empty()) + return; + + ScanTypeCheckedLoadIntrinsics(M); + + LLVM_DEBUG( + dbgs() << "VFE safe vtables:\n"; + for (auto *VTable : VFESafeVTables) + dbgs() << " " << VTable->getName() << "\n"; + ); +} + PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { bool Changed = false; @@ -163,6 +302,10 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { if (Comdat *C = GA.getComdat()) ComdatMembers.insert(std::make_pair(C, &GA)); + // Add dependencies between virtual call sites and the virtual functions they + // might call, if we have that information. + AddVirtualFunctionDependencies(M); + // Loop over the module, adding globals which are obviously necessary. 
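The intrinsic scan above is conservative by construction: a llvm.type.checked.load call whose offset operand is not a ConstantInt could read any slot, so every vtable registered for that typeid has to be dropped from the safe set. The classification step, sketched with hypothetical callbacks:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"

// Sketch: split a type.checked.load call into the precise case (constant
// offset, one known vtable slot) and the conservative case (unknown offset,
// un-mark every vtable registered for the typeid).
template <typename PreciseFn, typename UnknownFn>
static void classifyCheckedLoad(llvm::CallInst *CI, PreciseFn OnSlot,
                                UnknownFn OnUnknownOffset) {
  auto *Offset = llvm::dyn_cast<llvm::ConstantInt>(CI->getArgOperand(1));
  llvm::Metadata *TypeId =
      llvm::cast<llvm::MetadataAsValue>(CI->getArgOperand(2))->getMetadata();
  if (Offset)
    OnSlot(TypeId, Offset->getZExtValue());
  else
    OnUnknownOffset(TypeId);
}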
for (GlobalObject &GO : M.global_objects()) { Changed |= RemoveUnusedGlobalValue(GO); @@ -257,8 +400,17 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { }; NumFunctions += DeadFunctions.size(); - for (Function *F : DeadFunctions) + for (Function *F : DeadFunctions) { + if (!F->use_empty()) { + // Virtual functions might still be referenced by one or more vtables, + // but if we've proven them to be unused then it's safe to replace the + // virtual function pointers with null, allowing us to remove the + // function itself. + ++NumVFuncs; + F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType())); + } EraseUnusedGlobalValue(F); + } NumVariables += DeadGlobalVars.size(); for (GlobalVariable *GV : DeadGlobalVars) @@ -277,6 +429,8 @@ PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) { ConstantDependenciesCache.clear(); GVDependencies.clear(); ComdatMembers.clear(); + TypeIdMap.clear(); + VFESafeVTables.clear(); if (Changed) return PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp index c4fb3ce77f6e..0fd966457ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -25,7 +25,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -53,6 +52,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -65,6 +65,7 @@ #include "llvm/Transforms/Utils/CtorUtils.h" #include "llvm/Transforms/Utils/Evaluator.h" #include "llvm/Transforms/Utils/GlobalStatus.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <utility> @@ -155,7 +156,8 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) { /// Given a value that is stored to a global but never read, determine whether /// it's safe to remove the store and the chain of computation that feeds the /// store. -static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { +static bool IsSafeComputationToRemove( + Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { do { if (isa<Constant>(V)) return true; @@ -164,7 +166,7 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) || isa<GlobalValue>(V)) return false; - if (isAllocationFn(V, TLI)) + if (isAllocationFn(V, GetTLI)) return true; Instruction *I = cast<Instruction>(V); @@ -184,8 +186,9 @@ static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) { /// This GV is a pointer root. Loop over all users of the global and clean up /// any that obviously don't assign the global a value that isn't dynamically /// allocated. -static bool CleanupPointerRootUsers(GlobalVariable *GV, - const TargetLibraryInfo *TLI) { +static bool +CleanupPointerRootUsers(GlobalVariable *GV, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { // A brief explanation of leak checkers. The goal is to find bugs where // pointers are forgotten, causing an accumulating growth in memory // usage over time. 
The common strategy for leak checkers is to whitelist the @@ -241,18 +244,18 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, C->destroyConstant(); // This could have invalidated UI, start over from scratch. Dead.clear(); - CleanupPointerRootUsers(GV, TLI); + CleanupPointerRootUsers(GV, GetTLI); return true; } } } for (int i = 0, e = Dead.size(); i != e; ++i) { - if (IsSafeComputationToRemove(Dead[i].first, TLI)) { + if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) { Dead[i].second->eraseFromParent(); Instruction *I = Dead[i].first; do { - if (isAllocationFn(I, TLI)) + if (isAllocationFn(I, GetTLI)) break; Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); if (!J) @@ -270,9 +273,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, /// We just marked GV constant. Loop over all users of the global, cleaning up /// the obvious ones. This is largely just a quick scan over the use list to /// clean up the easy and obvious cruft. This returns true if it made a change. -static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool CleanupConstantGlobalUsers( + Value *V, Constant *Init, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; // Note that we need to use a weak value handle for the worklist items. When // we delete a constant array, we may also be holding pointer to one of its @@ -302,12 +305,12 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (Init) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); - Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI); } else if ((CE->getOpcode() == Instruction::BitCast && CE->getType()->isPointerTy()) || CE->getOpcode() == Instruction::AddrSpaceCast) { // Pointer cast, delete any stores and memsets to the global. - Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, TLI); + Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI); } if (CE->use_empty()) { @@ -321,7 +324,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, Constant *SubInit = nullptr; if (!isa<ConstantExpr>(GEP->getOperand(0))) { ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>( - ConstantFoldInstruction(GEP, DL, TLI)); + ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction()))); if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr) SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE); @@ -331,7 +334,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds()) SubInit = Constant::getNullValue(GEP->getResultElementType()); } - Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, TLI); + Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI); if (GEP->use_empty()) { GEP->eraseFromParent(); @@ -348,7 +351,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // us, and if they are all dead, nuke them without remorse. 
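The change threaded through every helper in this file is mechanical but worth naming: a single module-wide TargetLibraryInfo pointer becomes a function_ref lookup, because TargetLibraryInfoWrapperPass::getTLI now takes a Function and the answer can differ per function. The resulting shape, sketched with a hypothetical helper:

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"

// Sketch: helpers accept the lookup and resolve TLI at the point of use.
static bool usesMallocSketch(
    llvm::Function &F,
    llvm::function_ref<llvm::TargetLibraryInfo &(llvm::Function &)> GetTLI) {
  return GetTLI(F).has(llvm::LibFunc_malloc);
}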
if (isSafeToDestroyConstant(C)) { C->destroyConstant(); - CleanupConstantGlobalUsers(V, Init, DL, TLI); + CleanupConstantGlobalUsers(V, Init, DL, GetTLI); return true; } } @@ -430,6 +433,20 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) { return true; } +static bool CanDoGlobalSRA(GlobalVariable *GV) { + Constant *Init = GV->getInitializer(); + + if (isa<StructType>(Init->getType())) { + // nothing to check + } else if (SequentialType *STy = dyn_cast<SequentialType>(Init->getType())) { + if (STy->getNumElements() > 16 && GV->hasNUsesOrMore(16)) + return false; // It's not worth it. + } else + return false; + + return GlobalUsersSafeToSRA(GV); +} + /// Copy over the debug info for a variable to its SRA replacements. static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, uint64_t FragmentOffsetInBits, @@ -459,88 +476,94 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV, /// insert so that the caller can reprocess it. static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Make sure this global only has simple uses that we can SRA. - if (!GlobalUsersSafeToSRA(GV)) + if (!CanDoGlobalSRA(GV)) return nullptr; assert(GV->hasLocalLinkage()); Constant *Init = GV->getInitializer(); Type *Ty = Init->getType(); - std::vector<GlobalVariable *> NewGlobals; - Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + std::map<unsigned, GlobalVariable *> NewGlobals; // Get the alignment of the global, either explicit or target-specific. unsigned StartAlignment = GV->getAlignment(); if (StartAlignment == 0) StartAlignment = DL.getABITypeAlignment(GV->getType()); - if (StructType *STy = dyn_cast<StructType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - NewGlobals.reserve(NumElements); - const StructLayout &Layout = *DL.getStructLayout(STy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + // Loop over all users and create replacement variables for used aggregate + // elements. + for (User *GEP : GV->users()) { + assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() == + Instruction::GetElementPtr) || + isa<GetElementPtrInst>(GEP)) && + "NonGEP CE's are not SRAable!"); + + // Ignore the 1th operand, which has to be zero or else the program is quite + // broken (undefined). Get the 2nd operand, which is the structure or array + // index. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + if (NewGlobals.count(ElementIdx) == 1) + continue; // we've already created a replacement variable + assert(NewGlobals.count(ElementIdx) == 0); + + Type *ElTy = nullptr; + if (StructType *STy = dyn_cast<StructType>(Ty)) + ElTy = STy->getElementType(ElementIdx); + else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) + ElTy = STy->getElementType(); + assert(ElTy); + + Constant *In = Init->getAggregateElement(ElementIdx); + assert(In && "Couldn't get element of initializer?"); + + GlobalVariable *NGV = new GlobalVariable( + ElTy, false, GlobalVariable::InternalLinkage, In, + GV->getName() + "."
+ Twine(ElementIdx), GV->getThreadLocalMode(), + GV->getType()->getAddressSpace()); + NGV->setExternallyInitialized(GV->isExternallyInitialized()); + NGV->copyAttributesFrom(GV); + NewGlobals.insert(std::make_pair(ElementIdx, NGV)); + + if (StructType *STy = dyn_cast<StructType>(Ty)) { + const StructLayout &Layout = *DL.getStructLayout(STy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. - uint64_t FieldOffset = Layout.getElementOffset(i); - unsigned NewAlign = (unsigned)MinAlign(StartAlignment, FieldOffset); - if (NewAlign > DL.getABITypeAlignment(STy->getElementType(i))) + uint64_t FieldOffset = Layout.getElementOffset(ElementIdx); + Align NewAlign(MinAlign(StartAlignment, FieldOffset)); + if (NewAlign > + Align(DL.getABITypeAlignment(STy->getElementType(ElementIdx)))) NGV->setAlignment(NewAlign); // Copy over the debug info for the variable. uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType()); - uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(i); - transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, NumElements); - } - } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { - unsigned NumElements = STy->getNumElements(); - if (NumElements > 16 && GV->hasNUsesOrMore(16)) - return nullptr; // It's not worth it. - NewGlobals.reserve(NumElements); - auto ElTy = STy->getElementType(); - uint64_t EltSize = DL.getTypeAllocSize(ElTy); - unsigned EltAlign = DL.getABITypeAlignment(ElTy); - uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); - for (unsigned i = 0, e = NumElements; i != e; ++i) { - Constant *In = Init->getAggregateElement(i); - assert(In && "Couldn't get element of initializer?"); - - GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, - GlobalVariable::InternalLinkage, - In, GV->getName()+"."+Twine(i), - GV->getThreadLocalMode(), - GV->getType()->getAddressSpace()); - NGV->setExternallyInitialized(GV->isExternallyInitialized()); - NGV->copyAttributesFrom(GV); - Globals.push_back(NGV); - NewGlobals.push_back(NGV); + uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx); + transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, + STy->getNumElements()); + } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) { + uint64_t EltSize = DL.getTypeAllocSize(ElTy); + Align EltAlign(DL.getABITypeAlignment(ElTy)); + uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy); // Calculate the known alignment of the field. If the original aggregate // had 256 byte alignment for example, something might depend on that: // propagate info to each field. 
- unsigned NewAlign = (unsigned)MinAlign(StartAlignment, EltSize*i); + Align NewAlign(MinAlign(StartAlignment, EltSize * ElementIdx)); if (NewAlign > EltAlign) NGV->setAlignment(NewAlign); - transferSRADebugInfo(GV, NGV, FragmentSizeInBits * i, FragmentSizeInBits, - NumElements); + transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx, + FragmentSizeInBits, STy->getNumElements()); } } if (NewGlobals.empty()) return nullptr; + Module::GlobalListType &Globals = GV->getParent()->getGlobalList(); + for (auto NewGlobalVar : NewGlobals) + Globals.push_back(NewGlobalVar.second); + LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n"); Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext())); @@ -556,11 +579,11 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { // Ignore the 1th operand, which has to be zero or else the program is quite // broken (undefined). Get the 2nd operand, which is the structure or array // index. - unsigned Val = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); - if (Val >= NewGlobals.size()) Val = 0; // Out of bound array access. + unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue(); + assert(NewGlobals.count(ElementIdx) == 1); - Value *NewPtr = NewGlobals[Val]; - Type *NewTy = NewGlobals[Val]->getValueType(); + Value *NewPtr = NewGlobals[ElementIdx]; + Type *NewTy = NewGlobals[ElementIdx]->getValueType(); // Form a shorter GEP if needed. if (GEP->getNumOperands() > 3) { @@ -578,7 +601,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) Idxs.push_back(GEPI->getOperand(i)); NewPtr = GetElementPtrInst::Create( - NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(Val), GEPI); + NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx), + GEPI); } } GEP->replaceAllUsesWith(NewPtr); @@ -593,17 +617,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) { Globals.erase(GV); ++NumSRA; - // Loop over the new globals array deleting any globals that are obviously - // dead. This can arise due to scalarization of a structure or an array that - // has elements that are dead. - unsigned FirstGlobal = 0; - for (unsigned i = 0, e = NewGlobals.size(); i != e; ++i) - if (NewGlobals[i]->use_empty()) { - Globals.erase(NewGlobals[i]); - if (FirstGlobal == i) ++FirstGlobal; - } - - return FirstGlobal != NewGlobals.size() ? NewGlobals[FirstGlobal] : nullptr; + assert(NewGlobals.size() > 0); + return NewGlobals.begin()->second; } /// Return true if all users of the specified value will trap if the value is @@ -745,9 +760,9 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) { /// are uses of the loaded value that would trap if the loaded value is /// dynamically null, then we know that they cannot be reachable with a null /// optimize away the load. -static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool OptimizeAwayTrappingUsesOfLoads( + GlobalVariable *GV, Constant *LV, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; // Keep track of whether we are able to remove all the uses of the global @@ -793,10 +808,10 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // nor is the global. 
if (AllNonStoreUsesGone) { if (isLeakCheckerRoot(GV)) { - Changed |= CleanupPointerRootUsers(GV, TLI); + Changed |= CleanupPointerRootUsers(GV, GetTLI); } else { Changed = true; - CleanupConstantGlobalUsers(GV, nullptr, DL, TLI); + CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI); } if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); @@ -889,8 +904,8 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, while (!GV->use_empty()) { if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) { // The global is initialized when the store to it occurs. - new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, 0, - SI->getOrdering(), SI->getSyncScopeID(), SI); + new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false, + None, SI->getOrdering(), SI->getSyncScopeID(), SI); SI->eraseFromParent(); continue; } @@ -907,7 +922,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy, // Replace the cmp X, 0 with a use of the bool value. // Sink the load to where the compare was, if atomic rules allow us to. Value *LV = new LoadInst(InitBool->getValueType(), InitBool, - InitBool->getName() + ".val", false, 0, + InitBool->getName() + ".val", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI->isUnordered() ? (Instruction *)ICI : LI); InitBoolUsed = true; @@ -1562,10 +1577,10 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI, // Try to optimize globals based on the knowledge that only one value (besides // its initializer) is ever stored to the global. -static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, - AtomicOrdering Ordering, - const DataLayout &DL, - TargetLibraryInfo *TLI) { +static bool +optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, + AtomicOrdering Ordering, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI) { // Ignore no-op GEPs and bitcasts. StoredOnceVal = StoredOnceVal->stripPointerCasts(); @@ -1583,9 +1598,10 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType()); // Optimize away any trapping uses of the loaded value. - if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, TLI)) + if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI)) return true; - } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) { + } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) { + auto *TLI = &GetTLI(*CI->getFunction()); Type *MallocType = getMallocAllocatedType(CI, TLI); if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, DL, TLI)) @@ -1643,10 +1659,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { // instead of a select to synthesize the desired value. 
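The getActiveBits() tests added in the next hunk guard the later getZExtValue() calls: APInt::getZExtValue asserts once a value needs more than 64 bits, so the shrink-to-boolean rewrite must reject wide integer globals (an i128 initializer, say) before doing its ValInit/ValOther arithmetic. The guard, sketched:

#include "llvm/IR/Constants.h"

// Sketch: only narrow to uint64_t when the APInt provably fits.
static bool fitsInUInt64(const llvm::ConstantInt *CI) {
  return CI && CI->getValue().getActiveBits() <= 64;
}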
bool IsOneZero = false; bool EmitOneOrZero = true; - if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)){ + auto *CI = dyn_cast<ConstantInt>(OtherVal); + if (CI && CI->getValue().getActiveBits() <= 64) { IsOneZero = InitVal->isNullValue() && CI->isOne(); - if (ConstantInt *CIInit = dyn_cast<ConstantInt>(GV->getInitializer())){ + auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer()); + if (CIInit && CIInit->getValue().getActiveBits() <= 64) { uint64_t ValInit = CIInit->getZExtValue(); uint64_t ValOther = CI->getZExtValue(); uint64_t ValMinus = ValOther - ValInit; @@ -1711,7 +1729,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { assert(LI->getOperand(0) == GV && "Not a copy!"); // Insert a new load, to preserve the saved value. StoreVal = new LoadInst(NewGV->getValueType(), NewGV, - LI->getName() + ".b", false, 0, + LI->getName() + ".b", false, None, LI->getOrdering(), LI->getSyncScopeID(), LI); } else { assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) && @@ -1721,15 +1739,15 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { } } StoreInst *NSI = - new StoreInst(StoreVal, NewGV, false, 0, SI->getOrdering(), + new StoreInst(StoreVal, NewGV, false, None, SI->getOrdering(), SI->getSyncScopeID(), SI); NSI->setDebugLoc(SI->getDebugLoc()); } else { // Change the load into a load of bool then a select. LoadInst *LI = cast<LoadInst>(UI); - LoadInst *NLI = - new LoadInst(NewGV->getValueType(), NewGV, LI->getName() + ".b", - false, 0, LI->getOrdering(), LI->getSyncScopeID(), LI); + LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV, + LI->getName() + ".b", false, None, + LI->getOrdering(), LI->getSyncScopeID(), LI); Instruction *NSI; if (IsOneZero) NSI = new ZExtInst(NLI, LI->getType(), "", LI); @@ -1914,9 +1932,10 @@ static void makeAllConstantUsesInstructions(Constant *C) { /// Analyze the specified global variable and optimize /// it if possible. If we make a change, return true. -static bool processInternalGlobal( - GlobalVariable *GV, const GlobalStatus &GS, TargetLibraryInfo *TLI, - function_ref<DominatorTree &(Function &)> LookupDomTree) { +static bool +processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, + function_ref<DominatorTree &(Function &)> LookupDomTree) { auto &DL = GV->getParent()->getDataLayout(); // If this is a first class global and has only one accessing function and // this function is non-recursive, we replace the global with a local alloca @@ -1963,11 +1982,12 @@ static bool processInternalGlobal( bool Changed; if (isLeakCheckerRoot(GV)) { // Delete any constant stores to the global. - Changed = CleanupPointerRootUsers(GV, TLI); + Changed = CleanupPointerRootUsers(GV, GetTLI); } else { // Delete any stores we can find to the global. We may not be able to // make it completely dead though. - Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + Changed = + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); } // If the global is dead now, delete it. @@ -1989,7 +2009,7 @@ static bool processInternalGlobal( GV->setConstant(true); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); // If the global is dead now, just nuke it. 
if (GV->use_empty()) { @@ -2019,7 +2039,7 @@ static bool processInternalGlobal( GV->setInitializer(SOVConstant); // Clean up any obviously simplifiable users now. - CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, TLI); + CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI); if (GV->use_empty()) { LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to " @@ -2033,7 +2053,8 @@ static bool processInternalGlobal( // Try to optimize globals based on the knowledge that only one value // (besides its initializer) is ever stored to the global. - if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, TLI)) + if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL, + GetTLI)) return true; // Otherwise, if the global was not a boolean, we can shrink it to be a @@ -2054,7 +2075,8 @@ static bool processInternalGlobal( /// Analyze the specified global variable and optimize it if possible. If we /// make a change, return true. static bool -processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, +processGlobal(GlobalValue &GV, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree) { if (GV.getName().startswith("llvm.")) return false; @@ -2086,7 +2108,7 @@ processGlobal(GlobalValue &GV, TargetLibraryInfo *TLI, if (GVar->isConstant() || !GVar->hasInitializer()) return Changed; - return processInternalGlobal(GVar, GS, TLI, LookupDomTree) || Changed; + return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed; } /// Walk all of the direct calls of the specified function, changing them to @@ -2234,7 +2256,8 @@ hasOnlyColdCalls(Function &F, } static bool -OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, +OptimizeFunctions(Module &M, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<BlockFrequencyInfo &(Function &)> GetBFI, function_ref<DominatorTree &(Function &)> LookupDomTree, @@ -2285,7 +2308,7 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, } } - Changed |= processGlobal(*F, TLI, LookupDomTree); + Changed |= processGlobal(*F, GetTLI, LookupDomTree); if (!F->hasLocalLinkage()) continue; @@ -2342,7 +2365,8 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI, } static bool -OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, +OptimizeGlobalVars(Module &M, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<DominatorTree &(Function &)> LookupDomTree, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) { bool Changed = false; @@ -2357,7 +2381,10 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, if (GV->hasInitializer()) if (auto *C = dyn_cast<Constant>(GV->getInitializer())) { auto &DL = M.getDataLayout(); - Constant *New = ConstantFoldConstant(C, DL, TLI); + // TLI is not used in the case of a Constant, so use default nullptr + // for that optional parameter, since we don't have a Function to + // provide GetTLI anyway. 
+ Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr); if (New && New != C) GV->setInitializer(New); } @@ -2367,7 +2394,7 @@ OptimizeGlobalVars(Module &M, TargetLibraryInfo *TLI, continue; } - Changed |= processGlobal(*GV, TLI, LookupDomTree); + Changed |= processGlobal(*GV, GetTLI, LookupDomTree); } return Changed; } @@ -2581,8 +2608,8 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL, } static int compareNames(Constant *const *A, Constant *const *B) { - Value *AStripped = (*A)->stripPointerCastsNoFollowAliases(); - Value *BStripped = (*B)->stripPointerCastsNoFollowAliases(); + Value *AStripped = (*A)->stripPointerCasts(); + Value *BStripped = (*B)->stripPointerCasts(); return AStripped->getName().compare(BStripped->getName()); } @@ -2809,7 +2836,14 @@ OptimizeGlobalAliases(Module &M, return Changed; } -static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { +static Function * +FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { + // Hack to get a default TLI before we have actual Function. + auto FuncIter = M.begin(); + if (FuncIter == M.end()) + return nullptr; + auto *TLI = &GetTLI(*FuncIter); + LibFunc F = LibFunc_cxa_atexit; if (!TLI->has(F)) return nullptr; @@ -2818,6 +2852,9 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) { if (!Fn) return nullptr; + // Now get the actual TLI for Fn. + TLI = &GetTLI(*Fn); + // Make sure that the function has the correct prototype. if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit) return nullptr; @@ -2889,7 +2926,8 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) { } static bool optimizeGlobalsInModule( - Module &M, const DataLayout &DL, TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + function_ref<TargetLibraryInfo &(Function &)> GetTLI, function_ref<TargetTransformInfo &(Function &)> GetTTI, function_ref<BlockFrequencyInfo &(Function &)> GetBFI, function_ref<DominatorTree &(Function &)> LookupDomTree) { @@ -2914,24 +2952,24 @@ static bool optimizeGlobalsInModule( NotDiscardableComdats.insert(C); // Delete functions that are trivially dead, ccc -> fastcc - LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree, + LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree, NotDiscardableComdats); // Optimize global_ctors list. LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) { - return EvaluateStaticConstructor(F, DL, TLI); + return EvaluateStaticConstructor(F, DL, &GetTLI(*F)); }); // Optimize non-address-taken globals. - LocalChange |= OptimizeGlobalVars(M, TLI, LookupDomTree, - NotDiscardableComdats); + LocalChange |= + OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats); // Resolve aliases, when possible. LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats); // Try to remove trivial global destructors if they are not removed // already. 
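FindCXAAtExit above shows the awkward corner of the per-function TLI model: a module-level question (is __cxa_atexit available on this target?) needs some function to seed the lookup, so the code borrows the module's first function and then re-resolves TLI against the real callee once it is found. Condensed into a sketch (findCXAAtExitSketch is a hypothetical name):

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Sketch of the two-stage lookup: seed TLI from any function, then query
// again against __cxa_atexit itself before checking its prototype.
static Function *findCXAAtExitSketch(
    Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
  if (M.begin() == M.end())
    return nullptr; // no function available to seed the lookup
  TargetLibraryInfo *TLI = &GetTLI(*M.begin());
  if (!TLI->has(LibFunc_cxa_atexit))
    return nullptr;
  Function *Fn = M.getFunction(TLI->getName(LibFunc_cxa_atexit));
  if (!Fn)
    return nullptr;
  TLI = &GetTLI(*Fn); // the per-function answer may differ
  LibFunc F;
  return (TLI->getLibFunc(*Fn, F) && F == LibFunc_cxa_atexit) ? Fn : nullptr;
}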
- Function *CXAAtExitFn = FindCXAAtExit(M, TLI); + Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI); if (CXAAtExitFn) LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn); @@ -2946,12 +2984,14 @@ static bool optimizeGlobalsInModule( PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { auto &DL = M.getDataLayout(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{ return FAM.getResult<DominatorTreeAnalysis>(F); }; + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { return FAM.getResult<TargetIRAnalysis>(F); }; @@ -2960,7 +3000,7 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) { return FAM.getResult<BlockFrequencyAnalysis>(F); }; - if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree)) + if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } @@ -2979,10 +3019,12 @@ struct GlobalOptLegacyPass : public ModulePass { return false; auto &DL = M.getDataLayout(); - auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto LookupDomTree = [this](Function &F) -> DominatorTree & { return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); }; + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto GetTTI = [this](Function &F) -> TargetTransformInfo & { return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); }; @@ -2991,7 +3033,8 @@ struct GlobalOptLegacyPass : public ModulePass { return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); }; - return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree); + return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, + LookupDomTree); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp index 060043a40b89..4a319ead23c0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index ab1a9a79cad6..5e690714bfdf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -25,6 +25,7 @@ /// //===----------------------------------------------------------------------===// +#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -53,13 +54,14 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include 
"llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" -#include "llvm/Transforms/IPO/HotColdSplitting.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -85,12 +87,6 @@ static cl::opt<int> "multiple of TCC_Basic)")); namespace { - -/// A sequence of basic blocks. -/// -/// A 0-sized SmallVector is slightly cheaper to move than a std::vector. -using BlockSequence = SmallVector<BasicBlock *, 0>; - // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify // this function unless you modify the MBB version as well. // @@ -169,31 +165,6 @@ static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) { return Changed; } -class HotColdSplitting { -public: - HotColdSplitting(ProfileSummaryInfo *ProfSI, - function_ref<BlockFrequencyInfo *(Function &)> GBFI, - function_ref<TargetTransformInfo &(Function &)> GTTI, - std::function<OptimizationRemarkEmitter &(Function &)> *GORE, - function_ref<AssumptionCache *(Function &)> LAC) - : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE), LookupAC(LAC) {} - bool run(Module &M); - -private: - bool isFunctionCold(const Function &F) const; - bool shouldOutlineFrom(const Function &F) const; - bool outlineColdRegions(Function &F, bool HasProfileSummary); - Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT, - BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, unsigned Count); - ProfileSummaryInfo *PSI; - function_ref<BlockFrequencyInfo *(Function &)> GetBFI; - function_ref<TargetTransformInfo &(Function &)> GetTTI; - std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; - function_ref<AssumptionCache *(Function &)> LookupAC; -}; - class HotColdSplittingLegacyPass : public ModulePass { public: static char ID; @@ -236,6 +207,11 @@ bool HotColdSplitting::shouldOutlineFrom(const Function &F) const { if (F.hasFnAttribute(Attribute::NoInline)) return false; + // A function marked `noreturn` may contain unreachable terminators: these + // should not be considered cold, as the function may be a trampoline. + if (F.hasFnAttribute(Attribute::NoReturn)) + return false; + if (F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::SanitizeHWAddress) || F.hasFnAttribute(Attribute::SanitizeThread) || @@ -321,13 +297,10 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region, return Penalty; } -Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, - DominatorTree &DT, - BlockFrequencyInfo *BFI, - TargetTransformInfo &TTI, - OptimizationRemarkEmitter &ORE, - AssumptionCache *AC, - unsigned Count) { +Function *HotColdSplitting::extractColdRegion( + const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC, + DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI, + OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) { assert(!Region.empty()); // TODO: Pass BFI and BPI to update profile information. 
@@ -349,7 +322,7 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, return nullptr; Function *OrigF = Region[0]->getParent(); - if (Function *OutF = CE.extractCodeRegion()) { + if (Function *OutF = CE.extractCodeRegion(CEAC)) { User *U = *OutF->user_begin(); CallInst *CI = cast<CallInst>(U); CallSite CS(CI); @@ -360,6 +333,9 @@ Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region, } CI->setIsNoInline(); + if (OrigF->hasSection()) + OutF->setSection(OrigF->getSection()); + markFunctionCold(*OutF, BFI != nullptr); LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); @@ -607,9 +583,9 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { }); if (!DT) - DT = make_unique<DominatorTree>(F); + DT = std::make_unique<DominatorTree>(F); if (!PDT) - PDT = make_unique<PostDominatorTree>(F); + PDT = std::make_unique<PostDominatorTree>(F); auto Regions = OutliningRegion::create(*BB, *DT, *PDT); for (OutliningRegion &Region : Regions) { @@ -637,9 +613,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { } } + if (OutliningWorklist.empty()) + return Changed; + // Outline single-entry cold regions, splitting up larger regions as needed. unsigned OutlinedFunctionID = 1; - while (!OutliningWorklist.empty()) { + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. + CodeExtractorAnalysisCache CEAC(F); + do { OutliningRegion Region = OutliningWorklist.pop_back_val(); assert(!Region.empty() && "Empty outlining region in worklist"); do { @@ -650,14 +631,14 @@ bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) { BB->dump(); }); - Function *Outlined = extractColdRegion(SubRegion, *DT, BFI, TTI, ORE, AC, - OutlinedFunctionID); + Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI, + ORE, AC, OutlinedFunctionID); if (Outlined) { ++OutlinedFunctionID; Changed = true; } } while (!Region.empty()); - } + } while (!OutliningWorklist.empty()); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp index 7dc4d9ee9e34..1bda13a9bdd8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" using namespace llvm; @@ -254,7 +255,7 @@ static bool PropagateConstantReturn(Function &F) { // Find the index of the retval to replace with int index = -1; if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Ins)) - if (EV->hasIndices()) + if (EV->getNumIndices() == 1) index = *EV->idx_begin(); // If this use uses a specific return value, and we have a replacement, diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp index 34db75dd8b03..8a15800cbdb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/IPO.cpp @@ -43,7 +43,7 @@ void llvm::initializeIPO(PassRegistry &Registry) { initializeBlockExtractorPass(Registry); initializeSingleLoopExtractorPass(Registry); initializeLowerTypeTestsPass(Registry); - initializeMergeFunctionsPass(Registry); + initializeMergeFunctionsLegacyPassPass(Registry); 
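The restructured worklist loop above builds one CodeExtractorAnalysisCache per function, after the early return for an empty worklist, and feeds it to every extractCodeRegion call; the in-tree comment names the point of this: avoid O(n^2) compile time from rescanning the function once per region. A self-contained sketch of the same hoist-the-analysis idea, where the "analysis" is just a scan counter rather than LLVM's cache:

#include <cstdio>
#include <vector>

// Pretend analysis: an expensive whole-function scan whose result is reusable.
struct AnalysisCache {
  int ScansPerformed = 0;
  explicit AnalysisCache(const std::vector<int> &Func) { ++ScansPerformed; (void)Func; }
};

// One extraction consumes the shared cache instead of rebuilding it.
bool extractRegion(int Region, const AnalysisCache &Cache) {
  (void)Cache;
  return Region % 2 == 0; // placeholder decision
}

int main() {
  std::vector<int> Body(1000, 0);
  std::vector<int> Worklist{1, 2, 3, 4, 5};

  if (Worklist.empty())
    return 0;

  // Hoisted: built once per function and reused for every region, instead of
  // once per region (which would be quadratic over all regions).
  AnalysisCache Cache(Body);

  int Outlined = 0;
  do {
    int Region = Worklist.back();
    Worklist.pop_back();
    if (extractRegion(Region, Cache))
      ++Outlined;
  } while (!Worklist.empty());

  std::printf("outlined %d regions with %d scans\n", Outlined, Cache.ScansPerformed);
}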
initializePartialInlinerLegacyPassPass(Registry); initializeAttributorLegacyPassPass(Registry); initializePostOrderFunctionAttrsLegacyPassPass(Registry); @@ -114,6 +114,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createIPSCCPPass()); } +void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createMergeFunctionsPass()); +} + void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { auto PreserveMain = [=](const GlobalValue &GV) { return AllButMain && GV.getName() == "main"; @@ -121,6 +125,15 @@ void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) { unwrap(PM)->add(createInternalizePass(PreserveMain)); } +void LLVMAddInternalizePassWithMustPreservePredicate( + LLVMPassManagerRef PM, + void *Context, + LLVMBool (*Pred)(LLVMValueRef, void *)) { + unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) { + return Pred(wrap(&GV), Context) == 0 ? false : true; + })); +} + void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createStripDeadPrototypesPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp index 7f5511e008e1..685f8f7d7a00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InferFunctionAttrs.cpp @@ -11,6 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" @@ -18,24 +19,28 @@ using namespace llvm; #define DEBUG_TYPE "inferattrs" -static bool inferAllPrototypeAttributes(Module &M, - const TargetLibraryInfo &TLI) { +static bool inferAllPrototypeAttributes( + Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) { bool Changed = false; for (Function &F : M.functions()) // We only infer things using the prototype and the name; we don't need // definitions. if (F.isDeclaration() && !F.hasOptNone()) - Changed |= inferLibFuncAttributes(F, TLI); + Changed |= inferLibFuncAttributes(F, GetTLI(F)); return Changed; } PreservedAnalyses InferFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; - if (!inferAllPrototypeAttributes(M, TLI)) + if (!inferAllPrototypeAttributes(M, GetTLI)) // If we didn't infer anything, preserve all analyses. 
return PreservedAnalyses::all(); @@ -60,8 +65,10 @@ struct InferFunctionAttrsLegacyPass : public ModulePass { if (skipModule(M)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return inferAllPrototypeAttributes(M, TLI); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + return inferAllPrototypeAttributes(M, GetTLI); } }; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp index efb71b73cbb7..e818743544e6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/InlineSimple.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/Inliner.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp index 945f8affae6e..4b72261131c1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Inliner.cpp @@ -239,7 +239,7 @@ static void mergeInlinedArrayAllocas( } if (Align1 > Align2) - AvailableAlloca->setAlignment(AI->getAlignment()); + AvailableAlloca->setAlignment(MaybeAlign(AI->getAlignment())); } AI->eraseFromParent(); @@ -527,7 +527,8 @@ static void setInlineRemark(CallSite &CS, StringRef message) { static bool inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, std::function<AssumptionCache &(Function &)> GetAssumptionCache, - ProfileSummaryInfo *PSI, TargetLibraryInfo &TLI, + ProfileSummaryInfo *PSI, + std::function<TargetLibraryInfo &(Function &)> GetTLI, bool InsertLifetime, function_ref<InlineCost(CallSite CS)> GetInlineCost, function_ref<AAResults &(Function &)> AARGetter, @@ -626,7 +627,8 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG, Instruction *Instr = CS.getInstruction(); - bool IsTriviallyDead = isInstructionTriviallyDead(Instr, &TLI); + bool IsTriviallyDead = + isInstructionTriviallyDead(Instr, &GetTLI(*Caller)); int InlineHistoryID; if (!IsTriviallyDead) { @@ -757,13 +759,16 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) { CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); ACT = &getAnalysis<AssumptionCacheTracker>(); PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto GetTLI = [&](Function &F) -> TargetLibraryInfo & { + return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & { return ACT->getAssumptionCache(F); }; - return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime, - [this](CallSite CS) { return getInlineCost(CS); }, - LegacyAARGetter(*this), ImportedFunctionsStats); + return inlineCallsImpl( + SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime, + [this](CallSite CS) { return getInlineCost(CS); }, LegacyAARGetter(*this), + ImportedFunctionsStats); } /// Remove now-dead linkonce functions at the end of @@ -879,7 +884,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, if (!ImportedFunctionsStats && InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { ImportedFunctionsStats = - llvm::make_unique<ImportedFunctionsInliningStatistics>(); + 
std::make_unique<ImportedFunctionsInliningStatistics>(); ImportedFunctionsStats->setModuleInfo(M); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp index 2e269604e379..e1644819af61 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/Internalize.cpp @@ -24,6 +24,7 @@ #include "llvm/ADT/StringSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp index 91c7b5f5f135..f7108e8002ac 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LoopExtractor.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -141,10 +142,12 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (NumLoops == 0) return Changed; --NumLoops; AssumptionCache *AC = nullptr; + Function &Func = *L->getHeader()->getParent(); if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>()) - AC = ACT->lookupAssumptionCache(*L->getHeader()->getParent()); + AC = ACT->lookupAssumptionCache(Func); + CodeExtractorAnalysisCache CEAC(Func); CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC); - if (Extractor.extractCodeRegion() != nullptr) { + if (Extractor.extractCodeRegion(CEAC) != nullptr) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index f7371284f47e..fa664966faf7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Casting.h" @@ -230,6 +231,16 @@ void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits, Bytes[AllocByteOffset + B] |= AllocMask; } +bool lowertypetests::isJumpTableCanonical(Function *F) { + if (F->isDeclarationForLinker()) + return false; + auto *CI = mdconst::extract_or_null<ConstantInt>( + F->getParent()->getModuleFlag("CFI Canonical Jump Tables")); + if (!CI || CI->getZExtValue() != 0) + return true; + return F->hasFnAttribute("cfi-canonical-jump-table"); +} + namespace { struct ByteArrayInfo { @@ -251,9 +262,12 @@ class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { GlobalObject *GO; size_t NTypes; - // For functions: true if this is a definition (either in the merged module or - // in one of the thinlto modules). - bool IsDefinition; + // For functions: true if the jump table is canonical. This essentially means + // whether the canonical address (i.e. the symbol table entry) of the function + // is provided by the local jump table. 
This is normally the same as whether + // the function is defined locally, but if canonical jump tables are disabled + // by the user then the jump table never provides a canonical definition. + bool IsJumpTableCanonical; // For functions: true if this function is either defined or used in a thinlto // module and its jumptable entry needs to be exported to thinlto backends. @@ -263,13 +277,13 @@ class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> { public: static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO, - bool IsDefinition, bool IsExported, + bool IsJumpTableCanonical, bool IsExported, ArrayRef<MDNode *> Types) { auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate( totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember))); GTM->GO = GO; GTM->NTypes = Types.size(); - GTM->IsDefinition = IsDefinition; + GTM->IsJumpTableCanonical = IsJumpTableCanonical; GTM->IsExported = IsExported; std::uninitialized_copy(Types.begin(), Types.end(), GTM->getTrailingObjects<MDNode *>()); @@ -280,8 +294,8 @@ public: return GO; } - bool isDefinition() const { - return IsDefinition; + bool isJumpTableCanonical() const { + return IsJumpTableCanonical; } bool isExported() const { @@ -320,6 +334,49 @@ private: size_t NTargets; }; +struct ScopedSaveAliaseesAndUsed { + Module &M; + SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed; + std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases; + + ScopedSaveAliaseesAndUsed(Module &M) : M(M) { + // The users of this class want to replace all function references except + // for aliases and llvm.used/llvm.compiler.used with references to a jump + // table. We avoid replacing aliases in order to avoid introducing a double + // indirection (or an alias pointing to a declaration in ThinLTO mode), and + // we avoid replacing llvm.used/llvm.compiler.used because these global + // variables describe properties of the global, not the jump table (besides, + // offseted references to the jump table in llvm.used are invalid). + // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly + // indirect) users", so what we do is save the list of globals referenced by + // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW + // replace the aliasees and then set them back to their original values at + // the end. + if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false)) + GV->eraseFromParent(); + if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true)) + GV->eraseFromParent(); + + for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) { + // FIXME: This should look past all aliases not just interposable ones, + // see discussion on D65118. 
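The ScopedSaveAliaseesAndUsed comment above describes a save/erase/restore dance: stash what llvm.used, llvm.compiler.used, and aliases refer to, drop them so a blanket RAUW cannot rewrite them, then put everything back in the destructor. A toy RAII version of that pattern over a made-up module type (the ToyModule/ScopedSaveUsed names are illustrative, not LLVM API):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy IR: ordinary references that a bulk replacement should rewrite, plus a
// protected list (think llvm.used) whose entries must stay untouched.
struct ToyModule {
  std::vector<std::string> Uses;
  std::vector<std::string> UsedList;
};

// RAII guard: stash the protected entries, hide them from the bulk rewrite,
// restore them on scope exit.
struct ScopedSaveUsed {
  ToyModule &M;
  std::vector<std::string> Saved;
  explicit ScopedSaveUsed(ToyModule &Mod) : M(Mod), Saved(std::move(Mod.UsedList)) {
    M.UsedList.clear();
  }
  ~ScopedSaveUsed() { M.UsedList = std::move(Saved); }
};

void replaceAllUses(ToyModule &M, const std::string &Old, const std::string &New) {
  for (auto &U : M.Uses)
    if (U == Old)
      U = New;
  for (auto &U : M.UsedList) // would clobber protected entries if they were visible
    if (U == Old)
      U = New;
}

int main() {
  ToyModule M{{"f", "g", "f"}, {"f"}};
  {
    ScopedSaveUsed Guard(M);
    replaceAllUses(M, "f", "f.cfi_jt");
  }
  assert(M.Uses[0] == "f.cfi_jt" && M.UsedList[0] == "f"); // protected entry survived
}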
+ if (auto *F = + dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts())) + FunctionAliases.push_back({&GIS, F}); + } + } + + ~ScopedSaveAliaseesAndUsed() { + appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end())); + appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(), + CompilerUsed.end())); + + for (auto P : FunctionAliases) + P.first->setIndirectSymbol( + ConstantExpr::getBitCast(P.second, P.first->getType())); + } +}; + class LowerTypeTestsModule { Module &M; @@ -387,7 +444,8 @@ class LowerTypeTestsModule { uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL); TypeIdLowering importTypeId(StringRef TypeId); void importTypeTest(CallInst *CI); - void importFunction(Function *F, bool isDefinition); + void importFunction(Function *F, bool isJumpTableCanonical, + std::vector<GlobalAlias *> &AliasesToErase); BitSetInfo buildBitSet(Metadata *TypeId, @@ -421,7 +479,8 @@ class LowerTypeTestsModule { ArrayRef<GlobalTypeMember *> Globals, ArrayRef<ICallBranchFunnel *> ICallBranchFunnels); - void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, bool IsDefinition); + void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT, + bool IsJumpTableCanonical); void moveInitializerToModuleConstructor(GlobalVariable *GV); void findGlobalVariableUsersOf(Constant *C, SmallSetVector<GlobalVariable *, 8> &Out); @@ -433,7 +492,7 @@ class LowerTypeTestsModule { /// the block. 'This's use list is expected to have at least one element. /// Unlike replaceAllUsesWith this function skips blockaddr and direct call /// uses. - void replaceCfiUses(Function *Old, Value *New, bool IsDefinition); + void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical); /// replaceDirectCalls - Go through the uses list for this definition and /// replace each use, which is a direct function call. @@ -759,43 +818,50 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables( // Build a new global with the combined contents of the referenced globals. // This global is a struct whose even-indexed elements contain the original // contents of the referenced globals and whose odd-indexed elements contain - // any padding required to align the next element to the next power of 2. + // any padding required to align the next element to the next power of 2 plus + // any additional padding required to meet its alignment requirements. std::vector<Constant *> GlobalInits; const DataLayout &DL = M.getDataLayout(); + DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout; + Align MaxAlign; + uint64_t CurOffset = 0; + uint64_t DesiredPadding = 0; for (GlobalTypeMember *G : Globals) { - GlobalVariable *GV = cast<GlobalVariable>(G->getGlobal()); + auto *GV = cast<GlobalVariable>(G->getGlobal()); + MaybeAlign Alignment(GV->getAlignment()); + if (!Alignment) + Alignment = Align(DL.getABITypeAlignment(GV->getValueType())); + MaxAlign = std::max(MaxAlign, *Alignment); + uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, *Alignment); + GlobalLayout[G] = GVOffset; + if (GVOffset != 0) { + uint64_t Padding = GVOffset - CurOffset; + GlobalInits.push_back( + ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + } + GlobalInits.push_back(GV->getInitializer()); uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType()); + CurOffset = GVOffset + InitSize; - // Compute the amount of padding required. - uint64_t Padding = NextPowerOf2(InitSize - 1) - InitSize; + // Compute the amount of padding that we'd like for the next element. 
+ DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize; // Experiments of different caps with Chromium on both x64 and ARM64 // have shown that the 32-byte cap generates the smallest binary on // both platforms while different caps yield similar performance. // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html) - if (Padding > 32) - Padding = alignTo(InitSize, 32) - InitSize; - - GlobalInits.push_back( - ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding))); + if (DesiredPadding > 32) + DesiredPadding = alignTo(InitSize, 32) - InitSize; } - if (!GlobalInits.empty()) - GlobalInits.pop_back(); + Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits); auto *CombinedGlobal = new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true, GlobalValue::PrivateLinkage, NewInit); + CombinedGlobal->setAlignment(MaxAlign); StructType *NewTy = cast<StructType>(NewInit->getType()); - const StructLayout *CombinedGlobalLayout = DL.getStructLayout(NewTy); - - // Compute the offsets of the original globals within the new global. - DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout; - for (unsigned I = 0; I != Globals.size(); ++I) - // Multiply by 2 to account for padding elements. - GlobalLayout[Globals[I]] = CombinedGlobalLayout->getElementOffset(I * 2); - lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout); // Build aliases pointing to offsets into the combined global for each @@ -975,14 +1041,16 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) { } // ThinLTO backend: the function F has a jump table entry; update this module -// accordingly. isDefinition describes the type of the jump table entry. -void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { +// accordingly. isJumpTableCanonical describes the type of the jump table entry. +void LowerTypeTestsModule::importFunction( + Function *F, bool isJumpTableCanonical, + std::vector<GlobalAlias *> &AliasesToErase) { assert(F->getType()->getAddressSpace() == 0); GlobalValue::VisibilityTypes Visibility = F->getVisibility(); std::string Name = F->getName(); - if (F->isDeclarationForLinker() && isDefinition) { + if (F->isDeclarationForLinker() && isJumpTableCanonical) { // Non-dso_local functions may be overriden at run time, // don't short curcuit them if (F->isDSOLocal()) { @@ -997,12 +1065,13 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { } Function *FDecl; - if (F->isDeclarationForLinker() && !isDefinition) { - // Declaration of an external function. + if (!isJumpTableCanonical) { + // Either a declaration of an external function or a reference to a locally + // defined jump table. FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, F->getAddressSpace(), Name + ".cfi_jt", &M); FDecl->setVisibility(GlobalValue::HiddenVisibility); - } else if (isDefinition) { + } else { F->setName(Name + ".cfi"); F->setLinkage(GlobalValue::ExternalLinkage); FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage, @@ -1011,8 +1080,8 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { Visibility = GlobalValue::HiddenVisibility; // Delete aliases pointing to this function, they'll be re-created in the - // merged output - SmallVector<GlobalAlias*, 4> ToErase; + // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed + // will want to reset the aliasees first. 
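The buildBitSetsFromGlobalVariables hunk above replaces the old even/odd struct layout with an explicit running-offset computation: each member lands at the next offset satisfying both its own alignment and the power-of-2 padding requested after the previous member, with that requested padding capped at 32 bytes. A standalone sketch of the layout loop under those rules (alignTo/nextPowerOf2 here are local helpers written to match the semantics the code relies on, not the LLVM functions):

#include <cstdint>
#include <cstdio>
#include <vector>

// Round V up to the next multiple of A (A must be a power of two).
static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) & ~(A - 1); }

// Smallest power of two strictly greater than V, mirroring llvm::NextPowerOf2.
static uint64_t nextPowerOf2(uint64_t V) {
  uint64_t P = 1;
  while (P <= V)
    P <<= 1;
  return P;
}

struct GlobalDesc { uint64_t Size, Align; };

std::vector<uint64_t> layoutCombined(const std::vector<GlobalDesc> &Globals) {
  std::vector<uint64_t> Offsets;
  uint64_t CurOffset = 0, DesiredPadding = 0;
  for (const GlobalDesc &G : Globals) {
    uint64_t Offset = alignTo(CurOffset + DesiredPadding, G.Align);
    Offsets.push_back(Offset);
    CurOffset = Offset + G.Size;
    // Ask to pad the member out to a power of two, but never by more than the
    // 32-byte cap the Chromium experiments settled on.
    DesiredPadding = nextPowerOf2(G.Size - 1) - G.Size;
    if (DesiredPadding > 32)
      DesiredPadding = alignTo(G.Size, 32) - G.Size;
  }
  return Offsets;
}

int main() {
  for (uint64_t Off : layoutCombined({{5, 4}, {200, 8}, {3, 1}}))
    std::printf("%llu ", (unsigned long long)Off); // expected: 0 8 232
  std::printf("\n");
}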
for (auto &U : F->uses()) { if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) { Function *AliasDecl = Function::Create( @@ -1020,24 +1089,15 @@ void LowerTypeTestsModule::importFunction(Function *F, bool isDefinition) { F->getAddressSpace(), "", &M); AliasDecl->takeName(A); A->replaceAllUsesWith(AliasDecl); - ToErase.push_back(A); + AliasesToErase.push_back(A); } } - for (auto *A : ToErase) - A->eraseFromParent(); - } else { - // Function definition without type metadata, where some other translation - // unit contained a declaration with type metadata. This normally happens - // during mixed CFI + non-CFI compilation. We do nothing with the function - // so that it is treated the same way as a function defined outside of the - // LTO unit. - return; } - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isDefinition); + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical); else - replaceCfiUses(F, FDecl, isDefinition); + replaceCfiUses(F, FDecl, isJumpTableCanonical); // Set visibility late because it's used in replaceCfiUses() to determine // whether uses need to to be replaced. @@ -1225,7 +1285,7 @@ void LowerTypeTestsModule::findGlobalVariableUsersOf( // Replace all uses of F with (F ? JT : 0). void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( - Function *F, Constant *JT, bool IsDefinition) { + Function *F, Constant *JT, bool IsJumpTableCanonical) { // The target expression can not appear in a constant initializer on most // (all?) targets. Switch to a runtime initializer. SmallSetVector<GlobalVariable *, 8> GlobalVarUsers; @@ -1239,7 +1299,7 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Function::Create(cast<FunctionType>(F->getValueType()), GlobalValue::ExternalWeakLinkage, F->getAddressSpace(), "", &M); - replaceCfiUses(F, PlaceholderFn, IsDefinition); + replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical); Constant *Target = ConstantExpr::getSelect( ConstantExpr::getICmp(CmpInst::ICMP_NE, F, @@ -1276,8 +1336,9 @@ selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions, unsigned ArmCount = 0, ThumbCount = 0; for (const auto GTM : Functions) { - if (!GTM->isDefinition()) { + if (!GTM->isJumpTableCanonical()) { // PLT stubs are always ARM. + // FIXME: This is the wrong heuristic for non-canonical jump tables. ++ArmCount; continue; } @@ -1303,7 +1364,7 @@ void LowerTypeTestsModule::createJumpTable( cast<Function>(Functions[I]->getGlobal())); // Align the whole table by entry size. - F->setAlignment(getJumpTableEntrySize()); + F->setAlignment(Align(getJumpTableEntrySize())); // Skip prologue. // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3. // Luckily, this function does not get any prologue even without the @@ -1438,47 +1499,53 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative( lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout); - // Build aliases pointing to offsets into the jump table, and replace - // references to the original functions with references to the aliases. 
- for (unsigned I = 0; I != Functions.size(); ++I) { - Function *F = cast<Function>(Functions[I]->getGlobal()); - bool IsDefinition = Functions[I]->isDefinition(); - - Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( - ConstantExpr::getInBoundsGetElementPtr( - JumpTableType, JumpTable, - ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), - ConstantInt::get(IntPtrTy, I)}), - F->getType()); - if (Functions[I]->isExported()) { - if (IsDefinition) { - ExportSummary->cfiFunctionDefs().insert(F->getName()); + { + ScopedSaveAliaseesAndUsed S(M); + + // Build aliases pointing to offsets into the jump table, and replace + // references to the original functions with references to the aliases. + for (unsigned I = 0; I != Functions.size(); ++I) { + Function *F = cast<Function>(Functions[I]->getGlobal()); + bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical(); + + Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast( + ConstantExpr::getInBoundsGetElementPtr( + JumpTableType, JumpTable, + ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0), + ConstantInt::get(IntPtrTy, I)}), + F->getType()); + if (Functions[I]->isExported()) { + if (IsJumpTableCanonical) { + ExportSummary->cfiFunctionDefs().insert(F->getName()); + } else { + GlobalAlias *JtAlias = GlobalAlias::create( + F->getValueType(), 0, GlobalValue::ExternalLinkage, + F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); + JtAlias->setVisibility(GlobalValue::HiddenVisibility); + ExportSummary->cfiFunctionDecls().insert(F->getName()); + } + } + if (!IsJumpTableCanonical) { + if (F->hasExternalWeakLinkage()) + replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, + IsJumpTableCanonical); + else + replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical); } else { - GlobalAlias *JtAlias = GlobalAlias::create( - F->getValueType(), 0, GlobalValue::ExternalLinkage, - F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M); - JtAlias->setVisibility(GlobalValue::HiddenVisibility); - ExportSummary->cfiFunctionDecls().insert(F->getName()); + assert(F->getType()->getAddressSpace() == 0); + + GlobalAlias *FAlias = + GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "", + CombinedGlobalElemPtr, &M); + FAlias->setVisibility(F->getVisibility()); + FAlias->takeName(F); + if (FAlias->hasName()) + F->setName(FAlias->getName() + ".cfi"); + replaceCfiUses(F, FAlias, IsJumpTableCanonical); + if (!F->hasLocalLinkage()) + F->setVisibility(GlobalVariable::HiddenVisibility); } } - if (!IsDefinition) { - if (F->isWeakForLinker()) - replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr, IsDefinition); - else - replaceCfiUses(F, CombinedGlobalElemPtr, IsDefinition); - } else { - assert(F->getType()->getAddressSpace() == 0); - - GlobalAlias *FAlias = GlobalAlias::create( - F->getValueType(), 0, F->getLinkage(), "", CombinedGlobalElemPtr, &M); - FAlias->setVisibility(F->getVisibility()); - FAlias->takeName(F); - if (FAlias->hasName()) - F->setName(FAlias->getName() + ".cfi"); - replaceCfiUses(F, FAlias, IsDefinition); - if (!F->hasLocalLinkage()) - F->setVisibility(GlobalVariable::HiddenVisibility); - } } createJumpTable(JumpTableFn, Functions); @@ -1623,7 +1690,7 @@ bool LowerTypeTestsModule::runForTesting(Module &M) { ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -1643,7 +1710,8 @@ static 
bool isDirectCall(Use& U) { return false; } -void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefinition) { +void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, + bool IsJumpTableCanonical) { SmallSetVector<Constant *, 4> Constants; auto UI = Old->use_begin(), E = Old->use_end(); for (; UI != E;) { @@ -1655,7 +1723,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi continue; // Skip direct calls to externally defined or non-dso_local functions - if (isDirectCall(U) && (Old->isDSOLocal() || !IsDefinition)) + if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical)) continue; // Must handle Constants specially, we cannot call replaceUsesOfWith on a @@ -1678,16 +1746,7 @@ void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New, bool IsDefi } void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) { - auto UI = Old->use_begin(), E = Old->use_end(); - for (; UI != E;) { - Use &U = *UI; - ++UI; - - if (!isDirectCall(U)) - continue; - - U.set(New); - } + Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); }); } bool LowerTypeTestsModule::lower() { @@ -1734,10 +1793,16 @@ bool LowerTypeTestsModule::lower() { Decls.push_back(&F); } - for (auto F : Defs) - importFunction(F, /*isDefinition*/ true); - for (auto F : Decls) - importFunction(F, /*isDefinition*/ false); + std::vector<GlobalAlias *> AliasesToErase; + { + ScopedSaveAliaseesAndUsed S(M); + for (auto F : Defs) + importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase); + for (auto F : Decls) + importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase); + } + for (GlobalAlias *GA : AliasesToErase) + GA->eraseFromParent(); return true; } @@ -1746,7 +1811,7 @@ bool LowerTypeTestsModule::lower() { // reference them. This is used to partition the set of type identifiers in // the module into disjoint sets. using GlobalClassesTy = EquivalenceClasses< - PointerUnion3<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; + PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>; GlobalClassesTy GlobalClasses; // Verify the type metadata and build a few data structures to let us @@ -1823,6 +1888,17 @@ bool LowerTypeTestsModule::lower() { CfiFunctionLinkage Linkage = P.second.Linkage; MDNode *FuncMD = P.second.FuncMD; Function *F = M.getFunction(FunctionName); + if (F && F->hasLocalLinkage()) { + // Locally defined function that happens to have the same name as a + // function defined in a ThinLTO module. Rename it to move it out of + // the way of the external reference that we're about to create. + // Note that setName will find a unique name for the function, so even + // if there is an existing function with the suffix there won't be a + // name collision. 
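The replaceDirectCalls hunk above collapses a hand-written use-list walk into a single Old->replaceUsesWithIf(New, predicate) call. A standalone sketch of that predicate-filtered replacement over a toy value/use model (this is not LLVM's Use/Value machinery, just the same shape):

#include <functional>
#include <iostream>
#include <vector>

struct Value;
struct Use { Value *V; bool IsDirectCall; };

struct Value {
  std::vector<Use *> Uses;
  // Rewrite only the uses the predicate accepts; leave the rest alone.
  void replaceUsesWithIf(Value *New, const std::function<bool(Use &)> &ShouldReplace) {
    for (auto It = Uses.begin(); It != Uses.end();) {
      Use *U = *It;
      if (ShouldReplace(*U)) {
        U->V = New;
        New->Uses.push_back(U);
        It = Uses.erase(It); // the use now belongs to New's use list
      } else {
        ++It;
      }
    }
  }
};

int main() {
  Value Old, New;
  Use Call{&Old, /*IsDirectCall=*/true}, Addr{&Old, /*IsDirectCall=*/false};
  Old.Uses = {&Call, &Addr};
  Old.replaceUsesWithIf(&New, [](Use &U) { return U.IsDirectCall; });
  std::cout << "old uses left: " << Old.Uses.size()      // 1: the address-taken use
            << ", new uses: " << New.Uses.size() << "\n"; // 1: the direct call
}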
+ F->setName(F->getName() + ".1"); + F = nullptr; + } + if (!F) F = Function::Create( FunctionType::get(Type::getVoidTy(M.getContext()), false), @@ -1871,24 +1947,26 @@ bool LowerTypeTestsModule::lower() { Types.clear(); GO.getMetadata(LLVMContext::MD_type, Types); - bool IsDefinition = !GO.isDeclarationForLinker(); + bool IsJumpTableCanonical = false; bool IsExported = false; if (Function *F = dyn_cast<Function>(&GO)) { + IsJumpTableCanonical = isJumpTableCanonical(F); if (ExportedFunctions.count(F->getName())) { - IsDefinition |= ExportedFunctions[F->getName()].Linkage == CFL_Definition; + IsJumpTableCanonical |= + ExportedFunctions[F->getName()].Linkage == CFL_Definition; IsExported = true; // TODO: The logic here checks only that the function is address taken, // not that the address takers are live. This can be updated to check // their liveness and emit fewer jumptable entries once monolithic LTO // builds also emit summaries. } else if (!F->hasAddressTaken()) { - if (!CrossDsoCfi || !IsDefinition || F->hasLocalLinkage()) + if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage()) continue; } } - auto *GTM = - GlobalTypeMember::create(Alloc, &GO, IsDefinition, IsExported, Types); + auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical, + IsExported, Types); GlobalTypeMembers[&GO] = GTM; for (MDNode *Type : Types) { verifyTypeMDNode(&GO, Type); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 3a08069dcd4a..06d2a2f31941 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -115,12 +115,14 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/MergeFunctions.h" #include "llvm/Transforms/Utils/FunctionComparator.h" #include <algorithm> #include <cassert> @@ -195,16 +197,12 @@ public: /// by considering all pointer types to be equivalent. Once identified, /// MergeFunctions will fold them by replacing a call to one to a call to a /// bitcast of the other. 
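The MergeFunctions comment just above describes the overall strategy: detect functions that are equivalent up to pointer types and fold one onto the other. To keep that affordable, the pass keys candidates by a cheap hash so only equal-hash functions ever see the expensive comparator. A standalone sketch of that bucket-then-compare idea over plain strings (the hash and equality here are toys standing in for FunctionComparator):

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Body = std::string;

static size_t cheapHash(const Body &B) { return std::hash<std::string>{}(B); }
static bool deepEqual(const Body &A, const Body &B) { return A == B; } // expensive in real life

// Group by hash first; only bodies sharing a hash are compared pairwise, and
// duplicates fold onto the first (canonical) body in their group.
std::vector<int> foldDuplicates(const std::vector<Body> &Funcs) {
  std::map<size_t, std::vector<int>> Buckets;
  std::vector<int> CanonicalOf(Funcs.size());
  for (int I = 0; I < (int)Funcs.size(); ++I) {
    CanonicalOf[I] = I;
    auto &Bucket = Buckets[cheapHash(Funcs[I])];
    for (int J : Bucket)
      if (deepEqual(Funcs[I], Funcs[J])) { CanonicalOf[I] = J; break; }
    if (CanonicalOf[I] == I)
      Bucket.push_back(I);
  }
  return CanonicalOf;
}

int main() {
  std::vector<Body> Funcs{"add", "mul", "add"};
  for (int C : foldDuplicates(Funcs))
    std::cout << C << ' '; // prints "0 1 0": the third body folds onto the first
  std::cout << '\n';
}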
-class MergeFunctions : public ModulePass { +class MergeFunctions { public: - static char ID; - - MergeFunctions() - : ModulePass(ID), FnTree(FunctionNodeCmp(&GlobalNumbers)) { - initializeMergeFunctionsPass(*PassRegistry::getPassRegistry()); + MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) { } - bool runOnModule(Module &M) override; + bool runOnModule(Module &M); private: // The function comparison operator is provided here so that FunctionNodes do @@ -297,14 +295,39 @@ private: DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree; }; -} // end anonymous namespace +class MergeFunctionsLegacyPass : public ModulePass { +public: + static char ID; + + MergeFunctionsLegacyPass(): ModulePass(ID) { + initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; -char MergeFunctions::ID = 0; + MergeFunctions MF; + return MF.runOnModule(M); + } +}; -INITIALIZE_PASS(MergeFunctions, "mergefunc", "Merge Functions", false, false) +} // end anonymous namespace + +char MergeFunctionsLegacyPass::ID = 0; +INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc", + "Merge Functions", false, false) ModulePass *llvm::createMergeFunctionsPass() { - return new MergeFunctions(); + return new MergeFunctionsLegacyPass(); +} + +PreservedAnalyses MergeFunctionsPass::run(Module &M, + ModuleAnalysisManager &AM) { + MergeFunctions MF; + if (!MF.runOnModule(M)) + return PreservedAnalyses::all(); + return PreservedAnalyses::none(); } #ifndef NDEBUG @@ -386,9 +409,6 @@ static bool isEligibleForMerging(Function &F) { } bool MergeFunctions::runOnModule(Module &M) { - if (skipModule(M)) - return false; - bool Changed = false; // All functions in the module, ordered by hash. Functions with a unique @@ -449,28 +469,10 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { ++UI; CallSite CS(U->getUser()); if (CS && CS.isCallee(U)) { - // Transfer the called function's attributes to the call site. Due to the - // bitcast we will 'lose' ABI changing attributes because the 'called - // function' is no longer a Function* but the bitcast. Code that looks up - // the attributes from the called function will fail. - - // FIXME: This is not actually true, at least not anymore. The callsite - // will always have the same ABI affecting attributes as the callee, - // because otherwise the original input has UB. Note that Old and New - // always have matching ABI, so no attributes need to be changed. - // Transferring other attributes may help other optimizations, but that - // should be done uniformly and not in this ad-hoc way. - auto &Context = New->getContext(); - auto NewPAL = New->getAttributes(); - SmallVector<AttributeSet, 4> NewArgAttrs; - for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) - NewArgAttrs.push_back(NewPAL.getParamAttributes(argIdx)); - // Don't transfer attributes from the function to the callee. Function - // attributes typically aren't relevant to the calling convention or ABI. - CS.setAttributes(AttributeList::get(Context, /*FnAttrs=*/AttributeSet(), - NewPAL.getRetAttributes(), - NewArgAttrs)); - + // Do not copy attributes from the called function to the call-site. + // Function comparison ensures that the attributes are the same up to + // type congruences in byval(), in which case we need to keep the byval + // type of the call-site, not the callee function. 
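The MergeFunctions hunk above pulls the transform out of the ModulePass subclass into a plain class, then wraps it twice: a legacy-PM MergeFunctionsLegacyPass that forwards runOnModule, and a new-PM MergeFunctionsPass::run that converts the changed/unchanged result into PreservedAnalyses. A minimal sketch of that "one implementation, two pass-manager wrappers" shape; the Module and PreservedAnalyses types below are stubs, not the LLVM ones:

#include <iostream>

struct Module { int NumFunctions = 3; };
struct PreservedAnalyses {
  bool All;
  static PreservedAnalyses all() { return {true}; }
  static PreservedAnalyses none() { return {false}; }
};

// The transform lives in a plain class that knows nothing about pass managers,
// so both wrappers below can share it.
class MergeFunctionsImpl {
public:
  bool runOnModule(Module &M) { return M.NumFunctions > 1; } // "did we change anything"
};

// Legacy-PM style wrapper: just forwards.
class MergeFunctionsLegacyPass {
public:
  bool runOnModule(Module &M) {
    MergeFunctionsImpl MF;
    return MF.runOnModule(M);
  }
};

// New-PM style wrapper: same forwarding, different reporting convention.
class MergeFunctionsNewPMPass {
public:
  PreservedAnalyses run(Module &M) {
    MergeFunctionsImpl MF;
    return MF.runOnModule(M) ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};

int main() {
  Module M;
  std::cout << MergeFunctionsLegacyPass().runOnModule(M) << ' '
            << !MergeFunctionsNewPMPass().run(M).All << '\n'; // both report a change: "1 1"
}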
remove(CS.getInstruction()->getFunction()); U->set(BitcastNew); } @@ -769,7 +771,7 @@ void MergeFunctions::writeAlias(Function *F, Function *G) { PtrType->getElementType(), PtrType->getAddressSpace(), G->getLinkage(), "", BitcastF, G->getParent()); - F->setAlignment(std::max(F->getAlignment(), G->getAlignment())); + F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment()))); GA->takeName(G); GA->setVisibility(G->getVisibility()); GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); @@ -816,7 +818,7 @@ void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) { removeUsers(F); F->replaceAllUsesWith(NewF); - unsigned MaxAlignment = std::max(G->getAlignment(), NewF->getAlignment()); + MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment())); writeThunkOrAlias(F, G); writeThunkOrAlias(F, NewF); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp index 733782e8764d..cd3701e90308 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -409,7 +410,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F, return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = - llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + std::make_unique<FunctionOutliningMultiRegionInfo>(); auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { BasicBlock *Dom = BlockList.front(); @@ -589,7 +590,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { }; std::unique_ptr<FunctionOutliningInfo> OutliningInfo = - llvm::make_unique<FunctionOutliningInfo>(); + std::make_unique<FunctionOutliningInfo>(); BasicBlock *CurrEntry = EntryBlock; bool CandidateFound = false; @@ -701,7 +702,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) { return OutliningInfo; } -// Check if there is PGO data or user annoated branch data: +// Check if there is PGO data or user annotated branch data: static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { if (F->hasProfileData()) return true; @@ -966,7 +967,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, function_ref<AssumptionCache *(Function &)> LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); + ClonedOI = std::make_unique<FunctionOutliningInfo>(); // Clone the function, so that we can hack away on it. ValueToValueMapTy VMap; @@ -991,7 +992,7 @@ PartialInlinerImpl::FunctionCloner::FunctionCloner( OptimizationRemarkEmitter &ORE, function_ref<AssumptionCache *(Function &)> LookupAC) : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) { - ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>(); + ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); // Clone the function, so that we can hack away on it. 
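Several hunks in this section (Inliner, MergeFunctions, LowerTypeTests) wrap raw unsigned alignments in Align or MaybeAlign before calling setAlignment. The point of those wrapper types, as this diff uses them, is to keep the old "0 means unspecified" convention while making it explicit in the type and enforcing that a specified alignment is a non-zero power of two. A simplified stand-in pair of types showing that distinction (not the actual llvm/Support/Alignment.h definitions):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <optional>

// Align: always a non-zero power of two.
struct Align {
  uint64_t Value;
  explicit Align(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a non-zero power of two");
  }
};

// MaybeAlign: additionally represents "not specified" (0 in the old unsigned API).
struct MaybeAlign : std::optional<Align> {
  MaybeAlign() = default;
  explicit MaybeAlign(uint64_t V) { if (V != 0) emplace(V); }
};

int main() {
  MaybeAlign FromZero(0), FromEight(8);
  std::cout << FromZero.has_value() << ' ' << FromEight->Value << '\n'; // "0 8"
}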
ValueToValueMapTy VMap; @@ -1122,6 +1123,9 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { BranchProbabilityInfo BPI(*ClonedFunc, LI); ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); + // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. + CodeExtractorAnalysisCache CEAC(*ClonedFunc); + SetVector<Value *> Inputs, Outputs, Sinks; for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : ClonedOMRI->ORI) { @@ -1148,7 +1152,7 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { if (Outputs.size() > 0 && !ForceLiveExit) continue; - Function *OutlinedFunc = CE.extractCodeRegion(); + Function *OutlinedFunc = CE.extractCodeRegion(CEAC); if (OutlinedFunc) { CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); @@ -1210,11 +1214,12 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { } // Extract the body of the if. + CodeExtractorAnalysisCache CEAC(*ClonedFunc); Function *OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), /* AllowVarargs */ true) - .extractCodeRegion(); + .extractCodeRegion(CEAC); if (OutlinedFunc) { BasicBlock *OutliningCallBB = @@ -1264,7 +1269,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { if (PSI->isFunctionEntryCold(F)) return {false, nullptr}; - if (empty(F->users())) + if (F->users().empty()) return {false, nullptr}; OptimizationRemarkEmitter ORE(F); @@ -1370,7 +1375,7 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { return false; } - assert(empty(Cloner.OrigFunc->users()) && + assert(Cloner.OrigFunc->users().empty() && "F's users should all be replaced!"); std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 3ea77f08fd3c..7cfc29f7bf7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -13,6 +13,7 @@ #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm-c/Transforms/PassManagerBuilder.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" @@ -147,6 +148,10 @@ cl::opt<bool> EnableOrderFileInstrumentation( "enable-order-file-instrumentation", cl::init(false), cl::Hidden, cl::desc("Enable order file instrumentation (default = off)")); +static cl::opt<bool> + EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, + cl::desc("Enable lowering of the matrix intrinsics")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -183,8 +188,13 @@ PassManagerBuilder::~PassManagerBuilder() { } /// Set of global extensions, automatically added as part of the standard set. -static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy, - PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions; +static ManagedStatic< + SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy, + PassManagerBuilder::ExtensionFn, + PassManagerBuilder::GlobalExtensionID>, + 8>> + GlobalExtensions; +static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter; /// Check if GlobalExtensions is constructed and not empty. 
/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger @@ -193,10 +203,29 @@ static bool GlobalExtensionsNotEmpty() { return GlobalExtensions.isConstructed() && !GlobalExtensions->empty(); } -void PassManagerBuilder::addGlobalExtension( - PassManagerBuilder::ExtensionPointTy Ty, - PassManagerBuilder::ExtensionFn Fn) { - GlobalExtensions->push_back(std::make_pair(Ty, std::move(Fn))); +PassManagerBuilder::GlobalExtensionID +PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty, + PassManagerBuilder::ExtensionFn Fn) { + auto ExtensionID = GlobalExtensionsCounter++; + GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID)); + return ExtensionID; +} + +void PassManagerBuilder::removeGlobalExtension( + PassManagerBuilder::GlobalExtensionID ExtensionID) { + // RegisterStandardPasses may try to call this function after GlobalExtensions + // has already been destroyed; doing so should not generate an error. + if (!GlobalExtensions.isConstructed()) + return; + + auto GlobalExtension = + llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) { + return std::get<2>(elem) == ExtensionID; + }); + assert(GlobalExtension != GlobalExtensions->end() && + "The extension ID to be removed should always be valid."); + + GlobalExtensions->erase(GlobalExtension); } void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) { @@ -207,8 +236,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy, legacy::PassManagerBase &PM) const { if (GlobalExtensionsNotEmpty()) { for (auto &Ext : *GlobalExtensions) { - if (Ext.first == ETy) - Ext.second(*this, PM); + if (std::get<0>(Ext) == ETy) + std::get<1>(Ext)(*this, PM); } } for (unsigned i = 0, e = Extensions.size(); i != e; ++i) @@ -320,19 +349,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses( legacy::PassManagerBase &MPM) { // Start of function pass. // Break up aggregate allocas, using SSAUpdater. + assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!"); MPM.add(createSROAPass()); MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies - if (EnableGVNHoist) - MPM.add(createGVNHoistPass()); - if (EnableGVNSink) { - MPM.add(createGVNSinkPass()); - MPM.add(createCFGSimplificationPass()); + + if (OptLevel > 1) { + if (EnableGVNHoist) + MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } } - // Speculative execution if the target has divergent branches; otherwise nop. - MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); - MPM.add(createJumpThreadingPass()); // Thread jumps. - MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + if (OptLevel > 1) { + // Speculative execution if the target has divergent branches; otherwise nop. + MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); + + MPM.add(createJumpThreadingPass()); // Thread jumps. + MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals + } MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Combine silly seq's if (OptLevel > 2) @@ -346,8 +382,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses( if (SizeLevel == 0) MPM.add(createPGOMemOPSizeOptLegacyPass()); - MPM.add(createTailCallEliminationPass()); // Eliminate tail calls - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + // TODO: Investigate the cost/benefit of tail call elimination on debugging. 
+ if (OptLevel > 1) + MPM.add(createTailCallEliminationPass()); // Eliminate tail calls + MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createReassociatePass()); // Reassociate expressions // Begin the loop pass pipeline. @@ -360,6 +398,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( } // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + // TODO: Investigate promotion cap for O1. MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); @@ -402,16 +441,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // opened up by them. addInstructionCombiningPass(MPM); addExtensionsToPM(EP_Peephole, MPM); - MPM.add(createJumpThreadingPass()); // Thread jumps - MPM.add(createCorrelatedValuePropagationPass()); - MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + if (OptLevel > 1) { + MPM.add(createJumpThreadingPass()); // Thread jumps + MPM.add(createCorrelatedValuePropagationPass()); + MPM.add(createDeadStoreEliminationPass()); // Delete dead stores + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); if (RerollLoops) MPM.add(createLoopRerollPass()); + // TODO: Investigate if this is too expensive at O1. MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. @@ -654,6 +696,15 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createGlobalsAAWrapperPass()); MPM.add(createFloat2IntPass()); + MPM.add(createLowerConstantIntrinsicsPass()); + + if (EnableMatrix) { + MPM.add(createLowerMatrixIntrinsicsPass()); + // CSE the pointer arithmetic of the column vectors. This allows alias + // analysis to establish no-aliasing between loads and stores of different + // columns of the same matrix. + MPM.add(createEarlyCSEPass(false)); + } addExtensionsToPM(EP_VectorizerStart, MPM); @@ -898,7 +949,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // LTO provides additional opportunities for tailcall elimination due to // link-time inlining, and visibility of nocapture attribute. - PM.add(createTailCallEliminationPass()); + if (OptLevel > 1) + PM.add(createTailCallEliminationPass()); // Infer attributes on declarations, call sites, arguments, etc. PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture. 
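The PassManagerBuilder hunks above turn the global extension list into tuples of (extension point, callback, ID): addGlobalExtension now returns a GlobalExtensionID minted from a counter, and removeGlobalExtension erases the matching tuple so RegisterStandardPasses-style registrations can be undone. A standalone sketch of that handle-based registry (plain globals here rather than a ManagedStatic; the names are illustrative):

#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <tuple>
#include <vector>

using ExtensionFn = std::function<void(int /*ExtensionPoint*/)>;
using ExtensionID = uint64_t;

static std::vector<std::tuple<int, ExtensionFn, ExtensionID>> Extensions;
static ExtensionID NextID = 0;

ExtensionID addExtension(int Point, ExtensionFn Fn) {
  ExtensionID ID = NextID++;
  Extensions.emplace_back(Point, std::move(Fn), ID);
  return ID; // handle the caller can use to unregister later
}

void removeExtension(ExtensionID ID) {
  auto It = std::find_if(Extensions.begin(), Extensions.end(),
                         [ID](const auto &E) { return std::get<2>(E) == ID; });
  if (It != Extensions.end())
    Extensions.erase(It);
}

void runExtensions(int Point) {
  for (const auto &E : Extensions)
    if (std::get<0>(E) == Point)
      std::get<1>(E)(Point);
}

int main() {
  ExtensionID A = addExtension(1, [](int) { std::cout << "A "; });
  addExtension(1, [](int) { std::cout << "B "; });
  runExtensions(1);   // A B
  removeExtension(A); // unregister via the ID handed back at registration
  runExtensions(1);   // B
  std::cout << '\n';
}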
diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp index cb3915dfb678..45a0ce20eb17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/PruneEH.cpp @@ -18,15 +18,16 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/EHPersonalities.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp index 7be3608bd2ec..fdffffba0c2d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SCCP.cpp @@ -2,6 +2,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar/SCCP.h" @@ -9,16 +10,18 @@ using namespace llvm; PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) { const DataLayout &DL = M.getDataLayout(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); return { - make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)), + std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)), &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)}; }; - if (!runIPSCCP(M, DL, &TLI, getAnalysis)) + if (!runIPSCCP(M, DL, GetTLI, getAnalysis)) return PreservedAnalyses::all(); PreservedAnalyses PA; @@ -47,14 +50,14 @@ public: if (skipModule(M)) return false; const DataLayout &DL = M.getDataLayout(); - const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - + auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn { DominatorTree &DT = this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); return { - make_unique<PredicateInfo>( + std::make_unique<PredicateInfo>( F, DT, this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache( F)), @@ -62,7 +65,7 @@ public: nullptr}; // manager, so set them to nullptr. 
}; - return runIPSCCP(M, DL, TLI, getAnalysis); + return runIPSCCP(M, DL, GetTLI, getAnalysis); } void getAnalysisUsage(AnalysisUsage &AU) const override { diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp index 877d20e72ffc..a1fbb1adc412 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -26,13 +26,17 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" @@ -57,6 +61,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/SampleProf.h" @@ -72,6 +77,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/CallPromotionUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -79,6 +85,7 @@ #include <limits> #include <map> #include <memory> +#include <queue> #include <string> #include <system_error> #include <utility> @@ -88,6 +95,12 @@ using namespace llvm; using namespace sampleprof; using ProfileCount = Function::ProfileCount; #define DEBUG_TYPE "sample-profile" +#define CSINLINE_DEBUG DEBUG_TYPE "-inline" + +STATISTIC(NumCSInlined, + "Number of functions inlined with context sensitive profile"); +STATISTIC(NumCSNotInlined, + "Number of functions not inlined with context sensitive profile"); // Command line option to specify the file to read samples from. This is // mainly used for debugging. @@ -128,6 +141,31 @@ static cl::opt<bool> ProfileSampleAccurate( "callsite and function as having 0 samples. Otherwise, treat " "un-sampled callsites and functions conservatively as unknown. ")); +static cl::opt<bool> ProfileAccurateForSymsInList( + "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore, + cl::init(true), + cl::desc("For symbols in profile symbol list, regard their profiles to " + "be accurate. It may be overriden by profile-sample-accurate. 
")); + +static cl::opt<bool> ProfileMergeInlinee( + "sample-profile-merge-inlinee", cl::Hidden, cl::init(false), + cl::desc("Merge past inlinee's profile to outline version if sample " + "profile loader decided not to inline a call site.")); + +static cl::opt<bool> ProfileTopDownLoad( + "sample-profile-top-down-load", cl::Hidden, cl::init(false), + cl::desc("Do profile annotation and inlining for functions in top-down " + "order of call graph during sample profile loading.")); + +static cl::opt<bool> ProfileSizeInline( + "sample-profile-inline-size", cl::Hidden, cl::init(false), + cl::desc("Inline cold call sites in profile loader if it's beneficial " + "for code size.")); + +static cl::opt<int> SampleColdCallSiteThreshold( + "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45), + cl::desc("Threshold for inlining cold callsites")); + namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -137,9 +175,11 @@ using EdgeWeightMap = DenseMap<Edge, uint64_t>; using BlockEdgeMap = DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>; +class SampleProfileLoader; + class SampleCoverageTracker { public: - SampleCoverageTracker() = default; + SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){}; bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); @@ -185,6 +225,76 @@ private: /// keyed by FunctionSamples pointers, but these stats are cleared after /// every function, so we just need to keep a single counter. uint64_t TotalUsedSamples = 0; + + SampleProfileLoader &SPLoader; +}; + +class GUIDToFuncNameMapper { +public: + GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader, + DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap) + : CurrentReader(Reader), CurrentModule(M), + CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + for (const auto &F : CurrentModule) { + StringRef OrigName = F.getName(); + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(OrigName), OrigName}); + + // Local to global var promotion used by optimization like thinlto + // will rename the var and add suffix like ".llvm.xxx" to the + // original local name. In sample profile, the suffixes of function + // names are all stripped. Since it is possible that the mapper is + // built in post-thin-link phase and var promotion has been done, + // we need to add the substring of function name without the suffix + // into the GUIDToFuncNameMap. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (CanonName != OrigName) + CurrentGUIDToFuncNameMap.insert( + {Function::getGUID(CanonName), CanonName}); + } + + // Update GUIDToFuncNameMap for each function including inlinees. + SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap); + } + + ~GUIDToFuncNameMapper() { + if (CurrentReader.getFormat() != SPF_Compact_Binary) + return; + + CurrentGUIDToFuncNameMap.clear(); + + // Reset GUIDToFuncNameMap for of each function as they're no + // longer valid at this point. 
+ SetGUIDToFuncNameMapForAll(nullptr); + } + +private: + void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) { + std::queue<FunctionSamples *> FSToUpdate; + for (auto &IFS : CurrentReader.getProfiles()) { + FSToUpdate.push(&IFS.second); + } + + while (!FSToUpdate.empty()) { + FunctionSamples *FS = FSToUpdate.front(); + FSToUpdate.pop(); + FS->GUIDToFuncNameMap = Map; + for (const auto &ICS : FS->getCallsiteSamples()) { + const FunctionSamplesMap &FSMap = ICS.second; + for (auto &IFS : FSMap) { + FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second); + FSToUpdate.push(&FS); + } + } + } + } + + SampleProfileReader &CurrentReader; + Module &CurrentModule; + DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap; }; /// Sample profile pass. @@ -199,16 +309,19 @@ public: std::function<AssumptionCache &(Function &)> GetAssumptionCache, std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo) : GetAC(std::move(GetAssumptionCache)), - GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), - RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} + GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this), + Filename(Name), RemappingFilename(RemapName), + IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI); + ProfileSummaryInfo *_PSI, CallGraph *CG); void dump() { Reader->dump(); } protected: + friend class SampleCoverageTracker; + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); @@ -222,6 +335,10 @@ protected: bool inlineCallInstruction(Instruction *I); bool inlineHotFunctions(Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs); + // Inline cold/small functions in addition to hot ones + bool shouldInlineColdCallee(Instruction &CallInst); + void emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, bool Hot); void printEdgeWeight(raw_ostream &OS, Edge E); void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const; void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB); @@ -234,9 +351,12 @@ protected: void propagateWeights(Function &F); uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge); void buildEdges(Function &F); + std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG); bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); + bool callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI); /// Map basic blocks to their computed weights. /// @@ -310,6 +430,10 @@ protected: /// Profile Summary Info computed from sample profile. ProfileSummaryInfo *PSI = nullptr; + /// Profle Symbol list tells whether a function name appears in the binary + /// used to generate the current profile. + std::unique_ptr<ProfileSymbolList> PSL; + /// Total number of samples collected in this profile. /// /// This is the sum of all the samples collected in all the functions executed @@ -326,6 +450,21 @@ protected: uint64_t entryCount; }; DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo; + + // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for + // all the function symbols defined or declared in current module. 
+ DenseMap<uint64_t, StringRef> GUIDToFuncNameMap; + + // All the Names used in FunctionSamples including outline function + // names, inline instance names and call target names. + StringSet<> NamesInProfile; + + // For symbol in profile symbol list, whether to regard their profiles + // to be accurate. It is mainly decided by existance of profile symbol + // list and -profile-accurate-for-symsinlist flag, but it can be + // overriden by -profile-sample-accurate or profile-sample-accurate + // attribute. + bool ProfAccForSymsInList; }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -381,14 +520,23 @@ private: /// To decide whether an inlined callsite is hot, we compare the callsite /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is /// regarded as hot if the count is above the cutoff value. -static bool callsiteIsHot(const FunctionSamples *CallsiteFS, - ProfileSummaryInfo *PSI) { +/// +/// When ProfileAccurateForSymsInList is enabled and profile symbol list +/// is present, functions in the profile symbol list but without profile will +/// be regarded as cold and much less inlining will happen in CGSCC inlining +/// pass, so we tend to lower the hot criteria here to allow more early +/// inlining to happen for warm callsites and it is helpful for performance. +bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. assert(PSI && "PSI is expected to be non null"); uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); - return PSI->isHotCount(CallsiteTotalSamples); + if (ProfAccForSymsInList) + return !PSI->isColdCount(CallsiteTotalSamples); + else + return PSI->isHotCount(CallsiteTotalSamples); } /// Mark as used the sample record for the given function samples at @@ -425,7 +573,7 @@ SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countUsedRecords(CalleeSamples, PSI); } @@ -444,7 +592,7 @@ SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countBodyRecords(CalleeSamples, PSI); } @@ -465,7 +613,7 @@ SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Total += countBodySamples(CalleeSamples, PSI); } @@ -756,21 +904,52 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) { getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC, None, nullptr, nullptr); if (Cost.isNever()) { - ORE->emit(OptimizationRemark(DEBUG_TYPE, "Not inline", DLoc, BB) + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB) << "incompatible inlining"); return false; } InlineFunctionInfo IFI(nullptr, &GetAC); if (InlineFunction(CS, IFI)) { // The call to InlineFunction erases I, so we can't pass it here. 
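The rewritten callsiteIsHot above keeps the ProfileSummaryInfo cutoffs but, when profiles for symbols in the profile symbol list are treated as accurate, accepts any callsite that is not provably cold. A minimal sketch of that shape, with assumed cutoff values rather than real summary data:

#include <cstdint>
#include <iostream>

// Assumed stand-ins for the ProfileSummaryInfo cutoffs; the real values
// come from the profile summary.
constexpr uint64_t HotCutoff = 1000;
constexpr uint64_t ColdCutoff = 10;

static bool isHotCount(uint64_t C) { return C >= HotCutoff; }
static bool isColdCount(uint64_t C) { return C <= ColdCutoff; }

// Shape of the relaxed test: when listed symbols' profiles are trusted,
// a callsite that is not provably cold already qualifies for early inlining.
static bool callsiteIsHot(uint64_t TotalSamples, bool ProfAccForSymsInList) {
  if (ProfAccForSymsInList)
    return !isColdCount(TotalSamples);
  return isHotCount(TotalSamples);
}

int main() {
  std::cout << callsiteIsHot(500, false) << "\n"; // 0: warm, below the hot cutoff
  std::cout << callsiteIsHot(500, true) << "\n";  // 1: warm is now enough
}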
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "HotInline", DLoc, BB) - << "inlined hot callee '" << ore::NV("Callee", CalledFunction) + ORE->emit(OptimizationRemark(CSINLINE_DEBUG, "InlineSuccess", DLoc, BB) + << "inlined callee '" << ore::NV("Callee", CalledFunction) << "' into '" << ore::NV("Caller", BB->getParent()) << "'"); return true; } return false; } +bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) { + if (!ProfileSizeInline) + return false; + + Function *Callee = CallSite(&CallInst).getCalledFunction(); + if (Callee == nullptr) + return false; + + InlineCost Cost = + getInlineCost(cast<CallBase>(CallInst), getInlineParams(), + GetTTI(*Callee), GetAC, None, nullptr, nullptr); + + return Cost.getCost() <= SampleColdCallSiteThreshold; +} + +void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates( + const SmallVector<Instruction *, 10> &Candidates, const Function &F, + bool Hot) { + for (auto I : Candidates) { + Function *CalledFunction = CallSite(I).getCalledFunction(); + if (CalledFunction) { + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt", + I->getDebugLoc(), I->getParent()) + << "previous inlining reattempted for " + << (Hot ? "hotness: '" : "size: '") + << ore::NV("Callee", CalledFunction) << "' into '" + << ore::NV("Caller", &F) << "'"); + } + } +} + /// Iteratively inline hot callsites of a function. /// /// Iteratively traverse all callsites of the function \p F, and find if @@ -788,6 +967,14 @@ bool SampleProfileLoader::inlineHotFunctions( Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) { DenseSet<Instruction *> PromotedInsns; + // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure + // Profile symbol list is ignored when profile-sample-accurate is on. 
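The size-based gate added in shouldInlineColdCallee above reduces to a threshold comparison on the estimated inline cost. Sketched standalone with the default value from the new -sample-profile-cold-inline-threshold option:

#include <iostream>

constexpr int SampleColdCallSiteThreshold = 45; // default from the new option

// A cold callsite is only inlined for size when the feature is enabled and
// the estimated inline cost stays at or below the threshold.
static bool shouldInlineColdCallee(int EstimatedInlineCost,
                                   bool ProfileSizeInline) {
  if (!ProfileSizeInline)
    return false;
  return EstimatedInlineCost <= SampleColdCallSiteThreshold;
}

int main() {
  std::cout << shouldInlineColdCallee(30, true) << "\n";  // 1: small enough
  std::cout << shouldInlineColdCallee(120, true) << "\n"; // 0: too large
  std::cout << shouldInlineColdCallee(30, false) << "\n"; // 0: feature off
}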
+ assert((!ProfAccForSymsInList || + (!ProfileSampleAccurate && + !F.hasFnAttribute("profile-sample-accurate"))) && + "ProfAccForSymsInList should be false when profile-sample-accurate " + "is enabled"); + DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites; bool Changed = false; while (true) { @@ -795,20 +982,28 @@ bool SampleProfileLoader::inlineHotFunctions( SmallVector<Instruction *, 10> CIS; for (auto &BB : F) { bool Hot = false; - SmallVector<Instruction *, 10> Candidates; + SmallVector<Instruction *, 10> AllCandidates; + SmallVector<Instruction *, 10> ColdCandidates; for (auto &I : BB.getInstList()) { const FunctionSamples *FS = nullptr; if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { - Candidates.push_back(&I); + AllCandidates.push_back(&I); if (FS->getEntrySamples() > 0) localNotInlinedCallSites.try_emplace(&I, FS); if (callsiteIsHot(FS, PSI)) Hot = true; + else if (shouldInlineColdCallee(I)) + ColdCandidates.push_back(&I); } } if (Hot) { - CIS.insert(CIS.begin(), Candidates.begin(), Candidates.end()); + CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end()); + emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true); + } + else { + CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end()); + emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false); } } for (auto I : CIS) { @@ -854,6 +1049,7 @@ bool SampleProfileLoader::inlineHotFunctions( inlineCallInstruction(DI)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else { LLVM_DEBUG(dbgs() @@ -866,6 +1062,7 @@ bool SampleProfileLoader::inlineHotFunctions( if (inlineCallInstruction(I)) { localNotInlinedCallSites.erase(I); LocalChanged = true; + ++NumCSInlined; } } else if (IsThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( @@ -885,10 +1082,35 @@ bool SampleProfileLoader::inlineHotFunctions( Function *Callee = CallSite(I).getCalledFunction(); if (!Callee || Callee->isDeclaration()) continue; + + ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline", + I->getDebugLoc(), I->getParent()) + << "previous inlining not repeated: '" + << ore::NV("Callee", Callee) << "' into '" + << ore::NV("Caller", &F) << "'"); + + ++NumCSNotInlined; const FunctionSamples *FS = Pair.getSecond(); - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); + const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. + FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } } return Changed; } @@ -1219,17 +1441,12 @@ void SampleProfileLoader::buildEdges(Function &F) { } /// Returns the sorted CallTargetMap \p M by count in descending order. 
-static SmallVector<InstrProfValueData, 2> SortCallTargets( - const SampleRecord::CallTargetMap &M) { +static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets( + const SampleRecord::CallTargetMap & M) { SmallVector<InstrProfValueData, 2> R; - for (auto I = M.begin(); I != M.end(); ++I) - R.push_back({FunctionSamples::getGUID(I->getKey()), I->getValue()}); - llvm::sort(R, [](const InstrProfValueData &L, const InstrProfValueData &R) { - if (L.Count == R.Count) - return L.Value > R.Value; - else - return L.Count > R.Count; - }); + for (const auto &I : SampleRecord::SortCallTargets(M)) { + R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second}); + } return R; } @@ -1324,7 +1541,7 @@ void SampleProfileLoader::propagateWeights(Function &F) { if (!T || T.get().empty()) continue; SmallVector<InstrProfValueData, 2> SortedCallTargets = - SortCallTargets(T.get()); + GetSortedValueDataFromCallTargets(T.get()); uint64_t Sum; findIndirectCallFunctionSamples(I, Sum); annotateValueSite(*I.getParent()->getParent()->getParent(), I, @@ -1374,6 +1591,8 @@ void SampleProfileLoader::propagateWeights(Function &F) { } } + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + uint64_t TempWeight; // Only set weights if there is at least one non-zero weight. // In any other case, let the analyzer set weights. @@ -1555,32 +1774,58 @@ INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) +std::vector<Function *> +SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) { + std::vector<Function *> FunctionOrderList; + FunctionOrderList.reserve(M.size()); + + if (!ProfileTopDownLoad || CG == nullptr) { + for (Function &F : M) + if (!F.isDeclaration()) + FunctionOrderList.push_back(&F); + return FunctionOrderList; + } + + assert(&CG->getModule() == &M); + scc_iterator<CallGraph *> CGI = scc_begin(CG); + while (!CGI.isAtEnd()) { + for (CallGraphNode *node : *CGI) { + auto F = node->getFunction(); + if (F && !F->isDeclaration()) + FunctionOrderList.push_back(F); + } + ++CGI; + } + + std::reverse(FunctionOrderList.begin(), FunctionOrderList.end()); + return FunctionOrderList; +} + bool SampleProfileLoader::doInitialization(Module &M) { auto &Ctx = M.getContext(); - auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx); + + std::unique_ptr<SampleProfileReaderItaniumRemapper> RemapReader; + auto ReaderOrErr = + SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { std::string Msg = "Could not open profile: " + EC.message(); Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); return false; } Reader = std::move(ReaderOrErr.get()); - Reader->collectFuncsToUse(M); + Reader->collectFuncsFrom(M); ProfileIsValid = (Reader->read() == sampleprof_error::success); - - if (!RemappingFilename.empty()) { - // Apply profile remappings to the loaded profile data if requested. - // For now, we only support remapping symbols encoded using the Itanium - // C++ ABI's name mangling scheme. 
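With -sample-profile-top-down-load, buildFunctionOrder above walks the call graph's strongly connected components in post-order and reverses the list so callers are annotated before their callees. The toy below shows the same idea on an acyclic caller-to-callee graph; the real code uses scc_iterator so cycles are also handled.

#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using ToyCallGraph = std::map<std::string, std::vector<std::string>>;

// Depth-first post-order: callees are emitted before their callers.
static void postOrder(const std::string &F, const ToyCallGraph &CG,
                      std::set<std::string> &Seen,
                      std::vector<std::string> &Out) {
  if (!Seen.insert(F).second)
    return;
  auto It = CG.find(F);
  if (It != CG.end())
    for (const auto &Callee : It->second)
      postOrder(Callee, CG, Seen, Out);
  Out.push_back(F);
}

int main() {
  ToyCallGraph CG = {{"main", {"a", "b"}}, {"a", {"leaf"}}, {"b", {"leaf"}}};
  std::set<std::string> Seen;
  std::vector<std::string> Order;
  for (const auto &KV : CG)
    postOrder(KV.first, CG, Seen, Order);
  // Reversing the post-order yields a top-down order: callers first.
  std::reverse(Order.begin(), Order.end());
  for (const auto &F : Order)
    std::cout << F << "\n"; // main, b, a, leaf
}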
- ReaderOrErr = SampleProfileReaderItaniumRemapper::create( - RemappingFilename, Ctx, std::move(Reader)); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile remapping file: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - ProfileIsValid = (Reader->read() == sampleprof_error::success); + PSL = Reader->getProfileSymbolList(); + + // While profile-sample-accurate is on, ignore symbol list. + ProfAccForSymsInList = + ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; + if (ProfAccForSymsInList) { + NamesInProfile.clear(); + if (auto NameTable = Reader->getNameTable()) + NamesInProfile.insert(NameTable->begin(), NameTable->end()); } + return true; } @@ -1593,8 +1838,8 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) { } bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, - ProfileSummaryInfo *_PSI) { - FunctionSamples::GUIDToFuncNameMapper Mapper(M); + ProfileSummaryInfo *_PSI, CallGraph *CG) { + GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap); if (!ProfileIsValid) return false; @@ -1628,11 +1873,11 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, } bool retval = false; - for (auto &F : M) - if (!F.isDeclaration()) { - clearFunctionData(); - retval |= runOnFunction(F, AM); - } + for (auto F : buildFunctionOrder(M, CG)) { + assert(!F->isDeclaration()); + clearFunctionData(); + retval |= runOnFunction(*F, AM); + } // Account for cold calls not inlined.... for (const std::pair<Function *, NotInlinedProfileInfo> &pair : @@ -1647,23 +1892,52 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); ProfileSummaryInfo *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); - return SampleLoader.runOnModule(M, nullptr, PSI); + return SampleLoader.runOnModule(M, nullptr, PSI, nullptr); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { - + DILocation2SampleMap.clear(); // By default the entry count is initialized to -1, which will be treated // conservatively by getEntryCount as the same as unknown (None). This is // to avoid newly added code to be treated as cold. If we have samples // this will be overwritten in emitAnnotations. - // If ProfileSampleAccurate is true or F has profile-sample-accurate - // attribute, initialize the entry count to 0 so callsites or functions - // unsampled will be treated as cold. - uint64_t initialEntryCount = - (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) - ? 0 - : -1; + uint64_t initialEntryCount = -1; + + ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL; + if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) { + // initialize all the function entry counts to 0. It means all the + // functions without profile will be regarded as cold. + initialEntryCount = 0; + // profile-sample-accurate is a user assertion which has a higher precedence + // than symbol list. When profile-sample-accurate is on, ignore symbol list. + ProfAccForSymsInList = false; + } + + // PSL -- profile symbol list include all the symbols in sampled binary. + // If ProfileAccurateForSymsInList is enabled, PSL is used to treat + // old functions without samples being cold, without having to worry + // about new and hot functions being mistakenly treated as cold. 
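The doInitialization and runOnFunction changes combine three signals when picking an initial entry count: the profile-sample-accurate assertion, membership in the profile symbol list, and whether the name appears anywhere in the profile. The helper below is an illustrative summary of that policy only; names are made up and the real pass also honors the function attribute and canonical names.

#include <cstdint>
#include <iostream>
#include <set>
#include <string>

static uint64_t initialEntryCount(const std::string &Fn,
                                  bool ProfileSampleAccurate,
                                  const std::set<std::string> *PSL,
                                  const std::set<std::string> &NamesInProfile) {
  uint64_t Count = static_cast<uint64_t>(-1); // unknown by default
  if (ProfileSampleAccurate)
    return 0; // profile asserted accurate: unsampled code is cold
  if (PSL && PSL->count(Fn)) {
    Count = 0; // present in the sampled binary but has no samples: cold
    if (NamesInProfile.count(Fn))
      Count = static_cast<uint64_t>(-1); // shows up in the profile: not cold
  }
  return Count;
}

int main() {
  std::set<std::string> PSL = {"old_helper", "warm_fn"};
  std::set<std::string> InProfile = {"warm_fn"};
  std::cout << initialEntryCount("old_helper", false, &PSL, InProfile) << "\n"; // 0
  std::cout << initialEntryCount("warm_fn", false, &PSL, InProfile) << "\n";    // max: unknown
  std::cout << initialEntryCount("new_fn", false, &PSL, InProfile) << "\n";     // max: unknown
}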
+ if (ProfAccForSymsInList) { + // Initialize the entry count to 0 for functions in the list. + if (PSL->contains(F.getName())) + initialEntryCount = 0; + + // Function in the symbol list but without sample will be regarded as + // cold. To minimize the potential negative performance impact it could + // have, we want to be a little conservative here saying if a function + // shows up in the profile, no matter as outline function, inline instance + // or call targets, treat the function as not being cold. This will handle + // the cases such as most callsites of a function are inlined in sampled + // binary but not inlined in current build (because of source code drift, + // imprecise debug information, or the callsites are all cold individually + // but not cold accumulatively...), so the outline function showing up as + // cold in sampled binary will actually not be cold after current build. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (NamesInProfile.count(CanonName)) + initialEntryCount = -1; + } + F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr<OptimizationRemarkEmitter> OwnedORE; if (AM) { @@ -1672,7 +1946,7 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) .getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); } else { - OwnedORE = make_unique<OptimizationRemarkEmitter>(&F); + OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); ORE = OwnedORE.get(); } Samples = Reader->getSamplesFor(F); @@ -1699,10 +1973,12 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, : ProfileRemappingFileName, IsThinLTOPreLink, GetAssumptionCache, GetTTI); - SampleLoader.doInitialization(M); + if (!SampleLoader.doInitialization(M)) + return PreservedAnalyses::all(); ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); - if (!SampleLoader.runOnModule(M, &AM, PSI)) + CallGraph &CG = AM.getResult<CallGraphAnalysis>(M); + if (!SampleLoader.runOnModule(M, &AM, PSI, &CG)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp index 106db3c8bd9d..655a7a404951 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripDeadPrototypes.cpp @@ -16,6 +16,7 @@ #include "llvm/Transforms/IPO/StripDeadPrototypes.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp index 67a473612fc1..6ce00714523b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/StripSymbols.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" @@ -28,8 +27,10 @@ #include "llvm/IR/Module.h" #include "llvm/IR/TypeFinder.h" #include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; namespace { diff --git 
a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 24c476376c14..87a18171787f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Object/ModuleSymbolTable.h" #include "llvm/Pass.h" #include "llvm/Support/ScopedPrinter.h" @@ -24,6 +25,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" +#include "llvm/Transforms/IPO/LowerTypeTests.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -218,10 +220,18 @@ void splitAndWriteThinLTOBitcode( promoteTypeIds(M, ModuleId); - // Returns whether a global has attached type metadata. Such globals may - // participate in CFI or whole-program devirtualization, so they need to - // appear in the merged module instead of the thin LTO module. + // Returns whether a global or its associated global has attached type + // metadata. The former may participate in CFI or whole-program + // devirtualization, so they need to appear in the merged module instead of + // the thin LTO module. Similarly, globals that are associated with globals + // with type metadata need to appear in the merged module because they will + // reference the global's section directly. auto HasTypeMetadata = [](const GlobalObject *GO) { + if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated)) + if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0))) + if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue())) + if (AssocGO->hasMetadata(LLVMContext::MD_type)) + return true; return GO->hasMetadata(LLVMContext::MD_type); }; @@ -315,9 +325,9 @@ void splitAndWriteThinLTOBitcode( SmallVector<Metadata *, 4> Elts; Elts.push_back(MDString::get(Ctx, F.getName())); CfiFunctionLinkage Linkage; - if (!F.isDeclarationForLinker()) + if (lowertypetests::isJumpTableCanonical(&F)) Linkage = CFL_Definition; - else if (F.isWeakForLinker()) + else if (F.hasExternalWeakLinkage()) Linkage = CFL_WeakDeclaration; else Linkage = CFL_Declaration; @@ -457,7 +467,7 @@ void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, // splitAndWriteThinLTOBitcode). Just always build it once via the // buildModuleSummaryIndex when Module(s) are ready. ProfileSummaryInfo PSI(M); - NewIndex = llvm::make_unique<ModuleSummaryIndex>( + NewIndex = std::make_unique<ModuleSummaryIndex>( buildModuleSummaryIndex(M, nullptr, &PSI)); Index = NewIndex.get(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 6b6dd6194e17..5ccfb29b01a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -24,12 +24,14 @@ // returns 0, or a single vtable's function returns 1, replace each virtual // call with a comparison of the vptr against that vtable's address. // -// This pass is intended to be used during the regular and thin LTO pipelines. 
+// This pass is intended to be used during the regular and thin LTO pipelines: +// // During regular LTO, the pass determines the best optimization for each // virtual call and applies the resolutions directly to virtual calls that are // eligible for virtual call optimization (i.e. calls that use either of the -// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During -// ThinLTO, the pass operates in two phases: +// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). +// +// During hybrid Regular/ThinLTO, the pass operates in two phases: // - Export phase: this is run during the thin link over a single merged module // that contains all vtables with !type metadata that participate in the link. // The pass computes a resolution for each virtual call and stores it in the @@ -38,6 +40,14 @@ // modules. The pass applies the resolutions previously computed during the // import phase to each eligible virtual call. // +// During ThinLTO, the pass operates in two phases: +// - Export phase: this is run during the thin link over the index which +// contains a summary of all vtables with !type metadata that participate in +// the link. It computes a resolution for each virtual call and stores it in +// the type identifier summary. Only single implementation devirtualization +// is supported. +// - Import phase: (same as with hybrid case above). +// //===----------------------------------------------------------------------===// #include "llvm/Transforms/IPO/WholeProgramDevirt.h" @@ -70,10 +80,12 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndexYAML.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MathExtras.h" @@ -117,6 +129,11 @@ static cl::opt<unsigned> cl::desc("Maximum number of call targets per " "call site to enable branch funnels")); +static cl::opt<bool> + PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden, + cl::init(false), cl::ZeroOrMore, + cl::desc("Print index-based devirtualization messages")); + // Find the minimum offset that we may store a value of size Size bits at. If // IsAfter is set, look for an offset before the object, otherwise look for an // offset after the object. @@ -265,6 +282,25 @@ template <> struct DenseMapInfo<VTableSlot> { } }; +template <> struct DenseMapInfo<VTableSlotSummary> { + static VTableSlotSummary getEmptyKey() { + return {DenseMapInfo<StringRef>::getEmptyKey(), + DenseMapInfo<uint64_t>::getEmptyKey()}; + } + static VTableSlotSummary getTombstoneKey() { + return {DenseMapInfo<StringRef>::getTombstoneKey(), + DenseMapInfo<uint64_t>::getTombstoneKey()}; + } + static unsigned getHashValue(const VTableSlotSummary &I) { + return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^ + DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset); + } + static bool isEqual(const VTableSlotSummary &LHS, + const VTableSlotSummary &RHS) { + return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset; + } +}; + } // end namespace llvm namespace { @@ -342,19 +378,21 @@ struct CallSiteInfo { /// pass the vector is non-empty, we will need to add a use of llvm.type.test /// to each of the function summaries in the vector. 
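The new DenseMapInfo<VTableSlotSummary> keys call slots by a (type identifier string, byte offset) pair and XORs the two member hashes. An equivalent keying scheme expressed with std::unordered_map, for illustration only:

#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

struct VTableSlotKey {
  std::string TypeID;
  uint64_t ByteOffset;
  bool operator==(const VTableSlotKey &O) const {
    return TypeID == O.TypeID && ByteOffset == O.ByteOffset;
  }
};

struct SlotKeyHash {
  size_t operator()(const VTableSlotKey &K) const {
    // Same shape as the DenseMapInfo added in this patch: combine the two
    // member hashes (the patch also uses XOR).
    return std::hash<std::string>{}(K.TypeID) ^
           std::hash<uint64_t>{}(K.ByteOffset);
  }
};

int main() {
  std::unordered_map<VTableSlotKey, int, SlotKeyHash> CallSlots;
  CallSlots[{"_ZTS4Base", 0}] += 1;
  CallSlots[{"_ZTS4Base", 8}] += 1;
  CallSlots[{"_ZTS4Base", 0}] += 1;
  std::cout << CallSlots.size() << "\n"; // 2 distinct slots
}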
std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers; + std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers; bool isExported() const { return SummaryHasTypeTestAssumeUsers || !SummaryTypeCheckedLoadUsers.empty(); } - void markSummaryHasTypeTestAssumeUsers() { - SummaryHasTypeTestAssumeUsers = true; + void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { + SummaryTypeCheckedLoadUsers.push_back(FS); AllCallSitesDevirted = false; } - void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) { - SummaryTypeCheckedLoadUsers.push_back(FS); + void addSummaryTypeTestAssumeUser(FunctionSummary *FS) { + SummaryTypeTestAssumeUsers.push_back(FS); + SummaryHasTypeTestAssumeUsers = true; AllCallSitesDevirted = false; } @@ -450,13 +488,12 @@ struct DevirtModule { bool areRemarksEnabled(); - void scanTypeTestUsers(Function *TypeTestFunc, Function *AssumeFunc); + void scanTypeTestUsers(Function *TypeTestFunc); void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc); void buildTypeIdentifierMap( std::vector<VTableBits> &Bits, DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap); - Constant *getPointerAtOffset(Constant *I, uint64_t Offset); bool tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, @@ -464,7 +501,8 @@ struct DevirtModule { void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported); - bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot, + bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary, + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res); @@ -542,13 +580,45 @@ struct DevirtModule { function_ref<DominatorTree &(Function &)> LookupDomTree); }; +struct DevirtIndex { + ModuleSummaryIndex &ExportSummary; + // The set in which to record GUIDs exported from their module by + // devirtualization, used by client to ensure they are not internalized. + std::set<GlobalValue::GUID> &ExportedGUIDs; + // A map in which to record the information necessary to locate the WPD + // resolution for local targets in case they are exported by cross module + // importing. + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap; + + MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots; + + DevirtIndex( + ModuleSummaryIndex &ExportSummary, + std::set<GlobalValue::GUID> &ExportedGUIDs, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) + : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs), + LocalWPDTargetsMap(LocalWPDTargetsMap) {} + + bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset); + + bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set<ValueInfo> &DevirtTargets); + + void run(); +}; + struct WholeProgramDevirt : public ModulePass { static char ID; bool UseCommandLine = false; - ModuleSummaryIndex *ExportSummary; - const ModuleSummaryIndex *ImportSummary; + ModuleSummaryIndex *ExportSummary = nullptr; + const ModuleSummaryIndex *ImportSummary = nullptr; WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) { initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry()); @@ -572,7 +642,7 @@ struct WholeProgramDevirt : public ModulePass { // an optimization remark emitter on the fly, when we need it. 
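As the comment above notes, the legacy WPD pass builds its remark emitter lazily through a getter lambda. A stripped-down, non-LLVM version of that pattern:

#include <iostream>
#include <memory>
#include <string>

struct RemarkEmitter {
  explicit RemarkEmitter(std::string Fn) : Fn(std::move(Fn)) {}
  void emit(const std::string &Msg) const {
    std::cout << Fn << ": " << Msg << "\n";
  }
  std::string Fn;
};

int main() {
  std::unique_ptr<RemarkEmitter> ORE;
  // Construct the emitter only when a caller actually asks for it.
  auto OREGetter = [&](const std::string &Fn) -> RemarkEmitter & {
    ORE = std::make_unique<RemarkEmitter>(Fn);
    return *ORE;
  };
  OREGetter("foo").emit("devirtualized a call");
  OREGetter("bar").emit("devirtualized a call");
}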
std::unique_ptr<OptimizationRemarkEmitter> ORE; auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { - ORE = make_unique<OptimizationRemarkEmitter>(F); + ORE = std::make_unique<OptimizationRemarkEmitter>(F); return *ORE; }; @@ -632,6 +702,41 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M, return PreservedAnalyses::none(); } +namespace llvm { +void runWholeProgramDevirtOnIndex( + ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { + DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run(); +} + +void updateIndexWPDForExports( + ModuleSummaryIndex &Summary, + function_ref<bool(StringRef, ValueInfo)> isExported, + std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) { + for (auto &T : LocalWPDTargetsMap) { + auto &VI = T.first; + // This was enforced earlier during trySingleImplDevirt. + assert(VI.getSummaryList().size() == 1 && + "Devirt of local target has more than one copy"); + auto &S = VI.getSummaryList()[0]; + if (!isExported(S->modulePath(), VI)) + continue; + + // It's been exported by a cross module import. + for (auto &SlotSummary : T.second) { + auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID); + assert(TIdSum); + auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset); + assert(WPDRes != TIdSum->WPDRes.end()); + WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + WPDRes->second.SingleImplName, + Summary.getModuleHash(S->modulePath())); + } + } +} + +} // end namespace llvm + bool DevirtModule::runForTesting( Module &M, function_ref<AAResults &(Function &)> AARGetter, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter, @@ -662,7 +767,7 @@ bool DevirtModule::runForTesting( ExitOnError ExitOnErr( "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": "); std::error_code EC; - raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text); + raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text); ExitOnErr(errorCodeToError(EC)); yaml::Output Out(OS); @@ -706,38 +811,6 @@ void DevirtModule::buildTypeIdentifierMap( } } -Constant *DevirtModule::getPointerAtOffset(Constant *I, uint64_t Offset) { - if (I->getType()->isPointerTy()) { - if (Offset == 0) - return I; - return nullptr; - } - - const DataLayout &DL = M.getDataLayout(); - - if (auto *C = dyn_cast<ConstantStruct>(I)) { - const StructLayout *SL = DL.getStructLayout(C->getType()); - if (Offset >= SL->getSizeInBytes()) - return nullptr; - - unsigned Op = SL->getElementContainingOffset(Offset); - return getPointerAtOffset(cast<Constant>(I->getOperand(Op)), - Offset - SL->getElementOffset(Op)); - } - if (auto *C = dyn_cast<ConstantArray>(I)) { - ArrayType *VTableTy = C->getType(); - uint64_t ElemSize = DL.getTypeAllocSize(VTableTy->getElementType()); - - unsigned Op = Offset / ElemSize; - if (Op >= C->getNumOperands()) - return nullptr; - - return getPointerAtOffset(cast<Constant>(I->getOperand(Op)), - Offset % ElemSize); - } - return nullptr; -} - bool DevirtModule::tryFindVirtualCallTargets( std::vector<VirtualCallTarget> &TargetsForSlot, const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) { @@ -746,7 +819,7 @@ bool DevirtModule::tryFindVirtualCallTargets( return false; Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(), - TM.Offset + ByteOffset); + TM.Offset + ByteOffset, M); if (!Ptr) return false; @@ -766,6 +839,47 @@ bool DevirtModule::tryFindVirtualCallTargets( return !TargetsForSlot.empty(); } +bool 
DevirtIndex::tryFindVirtualCallTargets( + std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, + uint64_t ByteOffset) { + for (const TypeIdOffsetVtableInfo &P : TIdInfo) { + // Find the first non-available_externally linkage vtable initializer. + // We can have multiple available_externally, linkonce_odr and weak_odr + // vtable initializers, however we want to skip available_externally as they + // do not have type metadata attached, and therefore the summary will not + // contain any vtable functions. We can also have multiple external + // vtable initializers in the case of comdats, which we cannot check here. + // The linker should give an error in this case. + // + // Also, handle the case of same-named local Vtables with the same path + // and therefore the same GUID. This can happen if there isn't enough + // distinguishing path when compiling the source file. In that case we + // conservatively return false early. + const GlobalVarSummary *VS = nullptr; + bool LocalFound = false; + for (auto &S : P.VTableVI.getSummaryList()) { + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (LocalFound) + return false; + LocalFound = true; + } + if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) + VS = cast<GlobalVarSummary>(S->getBaseObject()); + } + if (!VS->isLive()) + continue; + for (auto VTP : VS->vTableFuncs()) { + if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset) + continue; + + TargetsForSlot.push_back(VTP.FuncVI); + } + } + + // Give up if we couldn't find any targets. + return !TargetsForSlot.empty(); +} + void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn, bool &IsExported) { auto Apply = [&](CallSiteInfo &CSInfo) { @@ -788,9 +902,38 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Apply(P.second); } +static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) { + // We can't add calls if we haven't seen a definition + if (Callee.getSummaryList().empty()) + return false; + + // Insert calls into the summary index so that the devirtualized targets + // are eligible for import. + // FIXME: Annotate type tests with hotness. For now, mark these as hot + // to better ensure we have the opportunity to inline them. + bool IsExported = false; + auto &S = Callee.getSummaryList()[0]; + CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0); + auto AddCalls = [&](CallSiteInfo &CSInfo) { + for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) { + FS->addCall({Callee, CI}); + IsExported |= S->modulePath() != FS->modulePath(); + } + }; + AddCalls(SlotInfo.CSInfo); + for (auto &P : SlotInfo.ConstCSInfo) + AddCalls(P.second); + return IsExported; +} + bool DevirtModule::trySingleImplDevirt( - MutableArrayRef<VirtualCallTarget> TargetsForSlot, - VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) { + ModuleSummaryIndex *ExportSummary, + MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res) { // See if the program contains a single implementation of this virtual // function. 
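Both trySingleImplDevirt overloads begin with the check described by the comment above: the slot can be devirtualized to a direct call only if every recorded target resolves to one and the same implementation. Spelled out on plain strings as a sketch:

#include <iostream>
#include <string>
#include <vector>

// True when every possible target of a vtable slot is the same function.
static bool hasSingleImplementation(const std::vector<std::string> &Targets) {
  if (Targets.empty())
    return false;
  for (const auto &T : Targets)
    if (T != Targets.front())
      return false;
  return true;
}

int main() {
  std::cout << hasSingleImplementation({"Derived::foo", "Derived::foo"}) << "\n"; // 1
  std::cout << hasSingleImplementation({"A::foo", "B::foo"}) << "\n";             // 0
}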
Function *TheFn = TargetsForSlot[0].Fn; @@ -830,6 +973,10 @@ bool DevirtModule::trySingleImplDevirt( TheFn->setVisibility(GlobalValue::HiddenVisibility); TheFn->setName(NewName); } + if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID())) + // Any needed promotion of 'TheFn' has already been done during + // LTO unit split, so we can ignore return value of AddCalls. + AddCalls(SlotInfo, TheFnVI); Res->TheKind = WholeProgramDevirtResolution::SingleImpl; Res->SingleImplName = TheFn->getName(); @@ -837,6 +984,63 @@ bool DevirtModule::trySingleImplDevirt( return true; } +bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot, + VTableSlotSummary &SlotSummary, + VTableSlotInfo &SlotInfo, + WholeProgramDevirtResolution *Res, + std::set<ValueInfo> &DevirtTargets) { + // See if the program contains a single implementation of this virtual + // function. + auto TheFn = TargetsForSlot[0]; + for (auto &&Target : TargetsForSlot) + if (TheFn != Target) + return false; + + // Don't devirtualize if we don't have target definition. + auto Size = TheFn.getSummaryList().size(); + if (!Size) + return false; + + // If the summary list contains multiple summaries where at least one is + // a local, give up, as we won't know which (possibly promoted) name to use. + for (auto &S : TheFn.getSummaryList()) + if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1) + return false; + + // Collect functions devirtualized at least for one call site for stats. + if (PrintSummaryDevirt) + DevirtTargets.insert(TheFn); + + auto &S = TheFn.getSummaryList()[0]; + bool IsExported = AddCalls(SlotInfo, TheFn); + if (IsExported) + ExportedGUIDs.insert(TheFn.getGUID()); + + // Record in summary for use in devirtualization during the ThinLTO import + // step. + Res->TheKind = WholeProgramDevirtResolution::SingleImpl; + if (GlobalValue::isLocalLinkage(S->linkage())) { + if (IsExported) + // If target is a local function and we are exporting it by + // devirtualizing a call in another module, we need to record the + // promoted name. + Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal( + TheFn.name(), ExportSummary.getModuleHash(S->modulePath())); + else { + LocalWPDTargetsMap[TheFn].push_back(SlotSummary); + Res->SingleImplName = TheFn.name(); + } + } else + Res->SingleImplName = TheFn.name(); + + // Name will be empty if this thin link driven off of serialized combined + // index (e.g. llvm-lto). However, WPD is not supported/invoked for the + // legacy LTO API anyway. + assert(!Res->SingleImplName.empty()); + + return true; +} + void DevirtModule::tryICallBranchFunnel( MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res, VTableSlot Slot) { @@ -1044,8 +1248,7 @@ std::string DevirtModule::getGlobalName(VTableSlot Slot, bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() { Triple T(M.getTargetTriple()); - return (T.getArch() == Triple::x86 || T.getArch() == Triple::x86_64) && - T.getObjectFormat() == Triple::ELF; + return T.isX86() && T.getObjectFormat() == Triple::ELF; } void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, @@ -1302,10 +1505,13 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { if (B.Before.Bytes.empty() && B.After.Bytes.empty()) return; - // Align each byte array to pointer width. 
- unsigned PointerSize = M.getDataLayout().getPointerSize(); - B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), PointerSize)); - B.After.Bytes.resize(alignTo(B.After.Bytes.size(), PointerSize)); + // Align the before byte array to the global's minimum alignment so that we + // don't break any alignment requirements on the global. + MaybeAlign Alignment(B.GV->getAlignment()); + if (!Alignment) + Alignment = + Align(M.getDataLayout().getABITypeAlignment(B.GV->getValueType())); + B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment)); // Before was stored in reverse order; flip it now. for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I) @@ -1322,6 +1528,7 @@ void DevirtModule::rebuildGlobal(VTableBits &B) { GlobalVariable::PrivateLinkage, NewInit, "", B.GV); NewGV->setSection(B.GV->getSection()); NewGV->setComdat(B.GV->getComdat()); + NewGV->setAlignment(MaybeAlign(B.GV->getAlignment())); // Copy the original vtable's metadata to the anonymous global, adjusting // offsets as required. @@ -1355,8 +1562,7 @@ bool DevirtModule::areRemarksEnabled() { return false; } -void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc, - Function *AssumeFunc) { +void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) { // Find all virtual calls via a virtual table pointer %p under an assumption // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p // points to a member of the type identifier %md. Group calls by (type ID, @@ -1483,8 +1689,11 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { } void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { + auto *TypeId = dyn_cast<MDString>(Slot.TypeID); + if (!TypeId) + return; const TypeIdSummary *TidSummary = - ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString()); + ImportSummary->getTypeIdSummary(TypeId->getString()); if (!TidSummary) return; auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset); @@ -1493,6 +1702,7 @@ void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) { const WholeProgramDevirtResolution &Res = ResI->second; if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) { + assert(!Res.SingleImplName.empty()); // The type of the function in the declaration is irrelevant because every // call site will cast it to the correct type. Constant *SingleImpl = @@ -1587,7 +1797,7 @@ bool DevirtModule::run() { return false; if (TypeTestFunc && AssumeFunc) - scanTypeTestUsers(TypeTestFunc, AssumeFunc); + scanTypeTestUsers(TypeTestFunc); if (TypeCheckedLoadFunc) scanTypeCheckedLoadUsers(TypeCheckedLoadFunc); @@ -1627,8 +1837,7 @@ bool DevirtModule::run() { // FIXME: Only add live functions. 
for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { for (Metadata *MD : MetadataByGUID[VF.GUID]) { - CallSlots[{MD, VF.Offset}] - .CSInfo.markSummaryHasTypeTestAssumeUsers(); + CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); } } for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { @@ -1641,7 +1850,7 @@ bool DevirtModule::run() { for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) { CallSlots[{MD, VC.VFunc.Offset}] .ConstCSInfo[VC.Args] - .markSummaryHasTypeTestAssumeUsers(); + .addSummaryTypeTestAssumeUser(FS); } } for (const FunctionSummary::ConstVCall &VC : @@ -1673,7 +1882,7 @@ bool DevirtModule::run() { cast<MDString>(S.first.TypeID)->getString()) .WPDRes[S.first.ByteOffset]; - if (!trySingleImplDevirt(TargetsForSlot, S.second, Res)) { + if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) { DidVirtualConstProp |= tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first); @@ -1710,7 +1919,7 @@ bool DevirtModule::run() { using namespace ore; OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F) << "devirtualized " - << NV("FunctionName", F->getName())); + << NV("FunctionName", DT.first)); } } @@ -1722,5 +1931,86 @@ bool DevirtModule::run() { for (VTableBits &B : Bits) rebuildGlobal(B); + // We have lowered or deleted the type checked load intrinsics, so we no + // longer have enough information to reason about the liveness of virtual + // function pointers in GlobalDCE. + for (GlobalVariable &GV : M.globals()) + GV.eraseMetadata(LLVMContext::MD_vcall_visibility); + return true; } + +void DevirtIndex::run() { + if (ExportSummary.typeIdCompatibleVtableMap().empty()) + return; + + DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID; + for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) { + NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first); + } + + // Collect information from summary about which calls to try to devirtualize. + for (auto &P : ExportSummary) { + for (auto &S : P.second.SummaryList) { + auto *FS = dyn_cast<FunctionSummary>(S.get()); + if (!FS) + continue; + // FIXME: Only add live functions. + for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS); + } + } + for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) { + for (StringRef Name : NameByGUID[VF.GUID]) { + CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_test_assume_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeTestAssumeUser(FS); + } + } + for (const FunctionSummary::ConstVCall &VC : + FS->type_checked_load_const_vcalls()) { + for (StringRef Name : NameByGUID[VC.VFunc.GUID]) { + CallSlots[{Name, VC.VFunc.Offset}] + .ConstCSInfo[VC.Args] + .addSummaryTypeCheckedLoadUser(FS); + } + } + } + } + + std::set<ValueInfo> DevirtTargets; + // For each (type, offset) pair: + for (auto &S : CallSlots) { + // Search each of the members of the type identifier for the virtual + // function implementation at offset S.first.ByteOffset, and add to + // TargetsForSlot. 
+ std::vector<ValueInfo> TargetsForSlot; + auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID); + assert(TidSummary); + if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary, + S.first.ByteOffset)) { + WholeProgramDevirtResolution *Res = + &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID) + .WPDRes[S.first.ByteOffset]; + + if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res, + DevirtTargets)) + continue; + } + } + + // Optionally have the thin link print message for each devirtualized + // function. + if (PrintSummaryDevirt) + for (const auto &DT : DevirtTargets) + errs() << "Devirtualized call to " << DT << "\n"; + + return; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index ba15b023f2a3..ec976a971e3c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -890,6 +890,10 @@ Instruction *InstCombiner::foldAddWithConstant(BinaryOperator &Add) { if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->getScalarSizeInBits() == 1) return SelectInst::Create(X, AddOne(Op1C), Op1); + // sext(bool) + C -> bool ? C - 1 : C + if (match(Op0, m_SExt(m_Value(X))) && + X->getType()->getScalarSizeInBits() == 1) + return SelectInst::Create(X, SubOne(Op1C), Op1); // ~X + C --> (C-1) - X if (match(Op0, m_Not(m_Value(X)))) @@ -1097,6 +1101,107 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I) { + assert((I.getOpcode() == Instruction::Add || + I.getOpcode() == Instruction::Or || + I.getOpcode() == Instruction::Sub) && + "Expecting add/or/sub instruction"); + + // We have a subtraction/addition between a (potentially truncated) *logical* + // right-shift of X and a "select". + Value *X, *Select; + Instruction *LowBitsToSkip, *Extract; + if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd( + m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)), + m_Instruction(Extract))), + m_Value(Select)))) + return nullptr; + + // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS. + if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select) + return nullptr; + + Type *XTy = X->getType(); + bool HadTrunc = I.getType() != XTy; + + // If there was a truncation of extracted value, then we'll need to produce + // one extra instruction, so we need to ensure one instruction will go away. + if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Extraction should extract high NBits bits, with shift amount calculated as: + // low bits to skip = shift bitwidth - high bits to extract + // The shift amount itself may be extended, and we need to look past zero-ext + // when matching NBits, that will matter for matching later. + Constant *C; + Value *NBits; + if (!match( + LowBitsToSkip, + m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) || + !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + X->getType()->getScalarSizeInBits())))) + return nullptr; + + // Sign-extending value can be zero-extended if we `sub`tract it, + // or sign-extended otherwise. 
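The InstCombineAddSub hunk above adds the fold "sext(bool) + C -> bool ? C - 1 : C", which is sound because sign-extending an i1 value of 1 yields -1. A quick standalone check of the equivalence:

#include <cstdint>
#include <iostream>

static int32_t sextBoolPlusC(bool B, int32_t C) {
  int32_t Ext = B ? -1 : 0; // sext i1 -> i32
  return Ext + C;
}

static int32_t folded(bool B, int32_t C) {
  return B ? C - 1 : C; // the select form InstCombine emits instead
}

int main() {
  for (bool B : {false, true})
    for (int32_t C : {0, 7, -5})
      std::cout << (sextBoolPlusC(B, C) == folded(B, C)) << " "; // all 1s
  std::cout << "\n";
}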
+ auto SkipExtInMagic = [&I](Value *&V) { + if (I.getOpcode() == Instruction::Sub) + match(V, m_ZExtOrSelf(m_Value(V))); + else + match(V, m_SExtOrSelf(m_Value(V))); + }; + + // Now, finally validate the sign-extending magic. + // `select` itself may be appropriately extended, look past that. + SkipExtInMagic(Select); + + ICmpInst::Predicate Pred; + const APInt *Thr; + Value *SignExtendingValue, *Zero; + bool ShouldSignext; + // It must be a select between two values we will later establish to be a + // sign-extending value and a zero constant. The condition guarding the + // sign-extension must be based on a sign bit of the same X we had in `lshr`. + if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)), + m_Value(SignExtendingValue), m_Value(Zero))) || + !isSignBitCheck(Pred, *Thr, ShouldSignext)) + return nullptr; + + // icmp-select pair is commutative. + if (!ShouldSignext) + std::swap(SignExtendingValue, Zero); + + // If we should not perform sign-extension then we must add/or/subtract zero. + if (!match(Zero, m_Zero())) + return nullptr; + // Otherwise, it should be some constant, left-shifted by the same NBits we + // had in `lshr`. Said left-shift can also be appropriately extended. + // Again, we must look past zero-ext when looking for NBits. + SkipExtInMagic(SignExtendingValue); + Constant *SignExtendingValueBaseConstant; + if (!match(SignExtendingValue, + m_Shl(m_Constant(SignExtendingValueBaseConstant), + m_ZExtOrSelf(m_Specific(NBits))))) + return nullptr; + // If we `sub`, then the constant should be one, else it should be all-ones. + if (I.getOpcode() == Instruction::Sub + ? !match(SignExtendingValueBaseConstant, m_One()) + : !match(SignExtendingValueBaseConstant, m_AllOnes())) + return nullptr; + + auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip, + Extract->getName() + ".sext"); + NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType()); +} + Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1), I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), @@ -1187,12 +1292,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(RHS, A); } - // Canonicalize sext to zext for better value tracking potential. - // add A, sext(B) --> sub A, zext(B) - if (match(&I, m_c_Add(m_Value(A), m_OneUse(m_SExt(m_Value(B))))) && - B->getType()->isIntOrIntVectorTy(1)) - return BinaryOperator::CreateSub(A, Builder.CreateZExt(B, Ty)); - // A + -B --> A - B if (match(RHS, m_Neg(m_Value(B)))) return BinaryOperator::CreateSub(LHS, B); @@ -1302,12 +1401,32 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Instruction *V = canonicalizeLowbitMask(I, Builder)) return V; + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I)) return SatAdd; return Changed ? &I : nullptr; } +/// Eliminate an op from a linear interpolation (lerp) pattern. 
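(A note on the algebra behind the factorizeLerp helper that follows: Y*(1 - Z) + X*Z rearranges to Y + Z*(X - Y). The two forms round differently in general, which is why this FP factorization is only attempted under the fast-math flags asserted in factorizeFAddFSub below. Illustrative sketch, not from the patch:)

#include <cassert>
#include <cmath>

double lerpExpanded(double X, double Y, double Z) { return Y * (1.0 - Z) + X * Z; }
double lerpFactored(double X, double Y, double Z) { return Y + Z * (X - Y); }

int main() {
  // Algebraically identical; with finite precision they only agree up to
  // rounding, hence the reassociation FMF requirement in the pass.
  for (double Z = 0.0; Z <= 1.0; Z += 0.125)
    assert(std::fabs(lerpExpanded(3.5, -2.0, Z) - lerpFactored(3.5, -2.0, Z)) < 1e-12);
}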
+static Instruction *factorizeLerp(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *X, *Y, *Z; + if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y), + m_OneUse(m_FSub(m_FPOne(), + m_Value(Z))))), + m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z)))))) + return nullptr; + + // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants] + Value *XY = Builder.CreateFSubFMF(X, Y, &I); + Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I); + return BinaryOperator::CreateFAddFMF(Y, MulZ, &I); +} + /// Factor a common operand out of fadd/fsub of fmul/fdiv. static Instruction *factorizeFAddFSub(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { @@ -1315,6 +1434,10 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I, I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub"); assert(I.hasAllowReassoc() && I.hasNoSignedZeros() && "FP factorization requires FMF"); + + if (Instruction *Lerp = factorizeLerp(I, Builder)) + return Lerp; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Value *X, *Y, *Z; bool IsFMul; @@ -1362,17 +1485,32 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I)) return FoldedFAdd; - Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - Value *X; // (-X) + Y --> Y - X - if (match(LHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(RHS, X, &I); - // Y + (-X) --> Y - X - if (match(RHS, m_FNeg(m_Value(X)))) - return BinaryOperator::CreateFSubFMF(LHS, X, &I); + Value *X, *Y; + if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateFSubFMF(Y, X, &I); + + // Similar to above, but look through fmul/fdiv for the negated term. + // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants] + Value *Z; + if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z)))) { + Value *XY = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } + // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants] + // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants] + if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))), + m_Value(Z))) || + match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))), + m_Value(Z)))) { + Value *XY = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFSubFMF(Z, XY, &I); + } // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. + Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { Value *LHSIntVal = LHSConv->getOperand(0); Type *FPType = LHSConv->getType(); @@ -1447,7 +1585,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, - Type *Ty) { + Type *Ty, bool IsNUW) { // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this. bool Swapped = false; @@ -1515,6 +1653,15 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, // Emit the offset of the GEP and an intptr_t. Value *Result = EmitGEPOffset(GEP1); + // If this is a single inbounds GEP and the original sub was nuw, + // then the final multiplication is also nuw. We match an extra add zero + // here, because that's what EmitGEPOffset() generates. 
+ Instruction *I; + if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() && + match(Result, m_Add(m_Instruction(I), m_Zero())) && + I->getOpcode() == Instruction::Mul) + I->setHasNoUnsignedWrap(); + // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. if (GEP2) { @@ -1631,37 +1778,50 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { const APInt *Op0C; if (match(Op0, m_APInt(Op0C))) { - unsigned BitWidth = I.getType()->getScalarSizeInBits(); - // -(X >>u 31) -> (X >>s 31) - // -(X >>s 31) -> (X >>u 31) if (Op0C->isNullValue()) { + Value *Op1Wide; + match(Op1, m_TruncOrSelf(m_Value(Op1Wide))); + bool HadTrunc = Op1Wide != Op1; + bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse(); + unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits(); + Value *X; const APInt *ShAmt; - if (match(Op1, m_LShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>u 31) -> (X >>s 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1); - return BinaryOperator::CreateAShr(X, ShAmtOp); + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (match(Op1, m_AShr(m_Value(X), m_APInt(ShAmt))) && + // -(X >>s 31) -> (X >>u 31) + if (NoTruncOrTruncIsOneUse && + match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) && *ShAmt == BitWidth - 1) { - Value *ShAmtOp = cast<Instruction>(Op1)->getOperand(1); - return BinaryOperator::CreateLShr(X, ShAmtOp); + Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1); + Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp); + NewShift->copyIRFlags(Op1Wide); + if (!HadTrunc) + return NewShift; + Builder.Insert(NewShift); + return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType()); } - if (Op1->hasOneUse()) { + if (!HadTrunc && Op1->hasOneUse()) { Value *LHS, *RHS; SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor; if (SPF == SPF_ABS || SPF == SPF_NABS) { // This is a negate of an ABS/NABS pattern. Just swap the operands // of the select. - SelectInst *SI = cast<SelectInst>(Op1); - Value *TrueVal = SI->getTrueValue(); - Value *FalseVal = SI->getFalseValue(); - SI->setTrueValue(FalseVal); - SI->setFalseValue(TrueVal); + cast<SelectInst>(Op1)->swapValues(); // Don't swap prof metadata, we didn't change the branch behavior. 
- return replaceInstUsesWith(I, SI); + return replaceInstUsesWith(I, Op1); } } } @@ -1686,6 +1846,23 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateNeg(Y); } + // (sub (or A, B) (and A, B)) --> (xor A, B) + { + Value *A, *B; + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + match(Op0, m_c_Or(m_Specific(A), m_Specific(B)))) + return BinaryOperator::CreateXor(A, B); + } + + // (sub (and A, B) (or A, B)) --> neg (xor A, B) + { + Value *A, *B; + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateXor(A, B)); + } + // (sub (or A, B), (xor A, B)) --> (and A, B) { Value *A, *B; @@ -1694,6 +1871,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return BinaryOperator::CreateAnd(A, B); } + // (sub (xor A, B) (or A, B)) --> neg (and A, B) + { + Value *A, *B; + if (match(Op0, m_Xor(m_Value(A), m_Value(B))) && + match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) && + (Op0->hasOneUse() || Op1->hasOneUse())) + return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B)); + } + { Value *Y; // ((X | Y) - X) --> (~X & Y) @@ -1702,6 +1888,74 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Y, Builder.CreateNot(Op1, Op1->getName() + ".not")); } + { + // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1)) + Value *X; + if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1), + m_OneUse(m_Neg(m_Value(X))))))) { + return BinaryOperator::CreateNeg(Builder.CreateAnd( + Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType())))); + } + } + + { + // (sub (and Op1, C), Op1) --> neg (and Op1, ~C) + Constant *C; + if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) { + return BinaryOperator::CreateNeg( + Builder.CreateAnd(Op1, Builder.CreateNot(C))); + } + } + + { + // If we have a subtraction between some value and a select between + // said value and something else, sink subtraction into select hands, i.e.: + // sub (select %Cond, %TrueVal, %FalseVal), %Op1 + // -> + // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1) + // or + // sub %Op0, (select %Cond, %TrueVal, %FalseVal) + // -> + // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal) + // This will result in select between new subtraction and 0. + auto SinkSubIntoSelect = + [Ty = I.getType()](Value *Select, Value *OtherHandOfSub, + auto SubBuilder) -> Instruction * { + Value *Cond, *TrueVal, *FalseVal; + if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal), + m_Value(FalseVal))))) + return nullptr; + if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal) + return nullptr; + // While it is really tempting to just create two subtractions and let + // InstCombine fold one of those to 0, it isn't possible to do so + // because of worklist visitation order. So ugly it is. + bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal; + Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal); + Constant *Zero = Constant::getNullValue(Ty); + SelectInst *NewSel = + SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub, + OtherHandOfSubIsTrueVal ? NewSub : Zero); + // Preserve prof metadata if any. 
+ NewSel->copyMetadata(cast<Instruction>(*Select)); + return NewSel; + }; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op0, /*OtherHandOfSub=*/Op1, + [Builder = &Builder, Op1](Value *OtherHandOfSelect) { + return Builder->CreateSub(OtherHandOfSelect, + /*OtherHandOfSub=*/Op1); + })) + return NewSel; + if (Instruction *NewSel = SinkSubIntoSelect( + /*Select=*/Op1, /*OtherHandOfSub=*/Op0, + [Builder = &Builder, Op0](Value *OtherHandOfSelect) { + return Builder->CreateSub(/*OtherHandOfSub=*/Op0, + OtherHandOfSelect); + })) + return NewSel; + } + if (Op1->hasOneUse()) { Value *X = nullptr, *Y = nullptr, *Z = nullptr; Constant *C = nullptr; @@ -1717,14 +1971,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Builder.CreateNot(Y, Y->getName() + ".not")); // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow. - // TODO: This could be extended to match arbitrary vector constants. - const APInt *DivC; - if (match(Op0, m_Zero()) && match(Op1, m_SDiv(m_Value(X), m_APInt(DivC))) && - !DivC->isMinSignedValue() && *DivC != 1) { - Constant *NegDivC = ConstantInt::get(I.getType(), -(*DivC)); - Instruction *BO = BinaryOperator::CreateSDiv(X, NegDivC); - BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); - return BO; + if (match(Op0, m_Zero())) { + Constant *Op11C; + if (match(Op1, m_SDiv(m_Value(X), m_Constant(Op11C))) && + !Op11C->containsUndefElement() && Op11C->isNotMinSignedValue() && + Op11C->isNotOneValue()) { + Instruction *BO = + BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(Op11C)); + BO->setIsExact(cast<BinaryOperator>(Op1)->isExact()); + return BO; + } } // 0 - (X << Y) -> (-X << Y) when X is freely negatable. @@ -1742,6 +1998,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Add->setHasNoSignedWrap(I.hasNoSignedWrap()); return Add; } + // sub [nsw] X, zext(bool Y) -> add [nsw] X, sext(bool Y) + // 'nuw' is dropped in favor of the canonical form. + if (match(Op1, m_ZExt(m_Value(Y))) && Y->getType()->isIntOrIntVectorTy(1)) { + Value *Sext = Builder.CreateSExt(Y, I.getType()); + BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Sext); + Add->setHasNoSignedWrap(I.hasNoSignedWrap()); + return Add; + } // X - A*-B -> X + A*B // X - -A*B -> X + A*B @@ -1778,7 +2042,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { std::swap(LHS, RHS); // LHS is now O above and expected to have at least 2 uses (the min/max) // NotA is epected to have 2 uses from the min/max and 1 from the sub. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && !NotA->hasNUsesOrMore(4)) { // Note: We don't generate the inverse max/min, just create the not of // it and let other folds do the rest. @@ -1796,13 +2060,15 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { Value *LHSOp, *RHSOp; if (match(Op0, m_PtrToInt(m_Value(LHSOp))) && match(Op1, m_PtrToInt(m_Value(RHSOp)))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + I.hasNoUnsignedWrap())) return replaceInstUsesWith(I, Res); // trunc(p)-trunc(q) -> trunc(p-q) if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) - if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) + if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(), + /* IsNUW */ false)) return replaceInstUsesWith(I, Res); // Canonicalize a shifty way to code absolute value to the common pattern. 
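(The or/and/xor subtraction folds added to visitSub above rest on the fact that, bitwise, A | B splits into the disjoint parts A ^ B and A & B, so (A | B) = (A ^ B) + (A & B) with no carries. A brute-force check of the identities at 8 bits, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned AI = 0; AI < 256; ++AI) {
    for (unsigned BI = 0; BI < 256; ++BI) {
      uint8_t a = AI, b = BI;
      // (sub (or A, B), (and A, B)) --> (xor A, B)
      assert(static_cast<uint8_t>((a | b) - (a & b)) == (a ^ b));
      // (sub (or A, B), (xor A, B)) --> (and A, B)
      assert(static_cast<uint8_t>((a | b) - (a ^ b)) == (a & b));
      // (sub (and A, B), (or A, B)) --> neg (xor A, B)
      assert(static_cast<uint8_t>((a & b) - (a | b)) ==
             static_cast<uint8_t>(-(a ^ b)));
    }
  }
}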
@@ -1826,6 +2092,10 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return SelectInst::Create(Cmp, Neg, A); } + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + if (Instruction *Ext = narrowMathIfNoOverflow(I)) return Ext; @@ -1865,6 +2135,22 @@ static Instruction *foldFNegIntoConstant(Instruction &I) { return nullptr; } +static Instruction *hoistFNegAboveFMulFDiv(Instruction &I, + InstCombiner::BuilderTy &Builder) { + Value *FNeg; + if (!match(&I, m_FNeg(m_Value(FNeg)))) + return nullptr; + + Value *X, *Y; + if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y))))) + return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I); + + return nullptr; +} + Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { Value *Op = I.getOperand(0); @@ -1882,6 +2168,9 @@ Instruction *InstCombiner::visitFNeg(UnaryOperator &I) { match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) return BinaryOperator::CreateFSubFMF(Y, X, &I); + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + return nullptr; } @@ -1903,6 +2192,9 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (Instruction *X = foldFNegIntoConstant(I)) return X; + if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder)) + return R; + Value *X, *Y; Constant *C; @@ -1944,6 +2236,21 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) { if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y)))))) return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I); + // Similar to above, but look through fmul/fdiv of the negated value: + // Op0 - (-X * Y) --> Op0 + (X * Y) + // Op0 - (Y * -X) --> Op0 + (X * Y) + if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) { + Value *FMul = Builder.CreateFMulFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FMul, &I); + } + // Op0 - (-X / Y) --> Op0 + (X / Y) + // Op0 - (X / -Y) --> Op0 + (X / Y) + if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) || + match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) { + Value *FDiv = Builder.CreateFDivFMF(X, Y, &I); + return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I); + } + // Handle special cases for FSub with selects feeding the operation if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1)) return replaceInstUsesWith(I, V); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2b9859b602f4..cc0a9127f8b1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -160,16 +160,14 @@ Instruction *InstCombiner::OptAndOp(BinaryOperator *Op, } /// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise -/// (V < Lo || V >= Hi). This method expects that Lo <= Hi. IsSigned indicates +/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates /// whether to treat V, Lo, and Hi as signed or not. Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi, bool isSigned, bool Inside) { - assert((isSigned ? Lo.sle(Hi) : Lo.ule(Hi)) && - "Lo is not <= Hi in range emission code!"); + assert((isSigned ? 
Lo.slt(Hi) : Lo.ult(Hi)) && + "Lo is not < Hi in range emission code!"); Type *Ty = V->getType(); - if (Lo == Hi) - return Inside ? ConstantInt::getFalse(Ty) : ConstantInt::getTrue(Ty); // V >= Min && V < Hi --> V < Hi // V < Min || V >= Hi --> V >= Hi @@ -1051,9 +1049,103 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, return nullptr; } +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. +static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, + ICmpInst *UnsignedICmp, bool IsAnd, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + Value *ZeroCmpOp; + ICmpInst::Predicate EqPred; + if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) || + !ICmpInst::isEquality(EqPred)) + return nullptr; + + auto IsKnownNonZero = [&](Value *V) { + return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + }; + + ICmpInst::Predicate UnsignedPred; + + Value *A, *B; + if (match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) && + match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && + (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { + if (UnsignedICmp->getOperand(0) != ZeroCmpOp) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { + if (!IsKnownNonZero(NonZero)) + std::swap(NonZero, Other); + return IsKnownNonZero(NonZero); + }; + + // Given ZeroCmpOp = (A + B) + // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A + // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A + // + // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff + // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff + // with X being the value (A/B) that is known to be non-zero, + // and Y being remaining value. 
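(One way to convince yourself of the first two rewrites listed above is to brute-force them at a narrow width. A standalone check assuming 8-bit unsigned arithmetic, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned AI = 0; AI < 256; ++AI) {
    for (unsigned BI = 0; BI < 256; ++BI) {
      uint8_t A = AI, B = BI;
      uint8_t Sum = static_cast<uint8_t>(A + B);  // ZeroCmpOp = A + B
      // Sum <= A && Sum != 0  <=>  (0 - B) < A
      assert(((Sum <= A) && (Sum != 0)) ==
             (static_cast<uint8_t>(0 - B) < A));
      // Sum > A || Sum == 0   <=>  (0 - B) >= A
      assert(((Sum > A) || (Sum == 0)) ==
             (static_cast<uint8_t>(0 - B) >= A));
    }
  }
}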
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE && + IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpULT(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ && + !IsAnd && GetKnownNonZeroAndOther(B, A)) + return Builder.CreateICmpUGE(Builder.CreateNeg(B), A); + } + + Value *Base, *Offset; + if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset)))) + return nullptr; + + if (!match(UnsignedICmp, + m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) || + !ICmpInst::isUnsigned(UnsignedPred)) + return nullptr; + if (UnsignedICmp->getOperand(0) != Base) + UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred); + + // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset + // (no overflow and not null) + if ((UnsignedPred == ICmpInst::ICMP_UGE || + UnsignedPred == ICmpInst::ICMP_UGT) && + EqPred == ICmpInst::ICMP_NE && IsAnd) + return Builder.CreateICmpUGT(Base, Offset); + + // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset + // (overflow or null) + if ((UnsignedPred == ICmpInst::ICMP_ULE || + UnsignedPred == ICmpInst::ICMP_ULT) && + EqPred == ICmpInst::ICMP_EQ && !IsAnd) + return Builder.CreateICmpULE(Base, Offset); + + // Base <= Offset && (Base - Offset) != 0 --> Base < Offset + if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && + IsAnd) + return Builder.CreateICmpULT(Base, Offset); + + // Base > Offset || (Base - Offset) == 0 --> Base >= Offset + if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && + !IsAnd) + return Builder.CreateICmpUGE(Base, Offset); + + return nullptr; +} + /// Fold (icmp)&(icmp) if possible. Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI)) @@ -1096,6 +1188,13 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2). 
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0); ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1)); @@ -1196,16 +1295,22 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_ULT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13 + // (X != 13 & X u< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpULT(LHS0, LHSC); - if (LHSC->isZero()) // (X != 0 & X u< 14) -> X-1 u< 13 + if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); break; // (X != 13 & X u< 15) -> no change case ICmpInst::ICMP_SLT: - if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13 + // (X != 13 & X s< 14) -> X < 13 + if (LHSC->getValue() == (RHSC->getValue() - 1)) return Builder.CreateICmpSLT(LHS0, LHSC); - break; // (X != 13 & X s< 15) -> no change + // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1)) + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); + break; // (X != 13 & X s< 15) -> no change case ICmpInst::ICMP_NE: // Potential folds for this case should already be handled. break; @@ -1216,10 +1321,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14 + // (X u> 13 & X != 14) -> X u> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1) + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + false, true); break; // (X u> 13 & X != 15) -> no change - case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1 + case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), false, true); } @@ -1229,10 +1339,15 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_NE: - if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14 + // (X s> 13 & X != 14) -> X s> 14 + if (RHSC->getValue() == (LHSC->getValue() + 1)) return Builder.CreateICmp(PredL, LHS0, RHSC); + // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1) + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), + true, true); break; // (X s> 13 & X != 15) -> no change - case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1 + case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1 return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true, true); } @@ -1352,8 +1467,8 @@ static Instruction *matchDeMorgansLaws(BinaryOperator &I, Value *A, *B; if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && - !IsFreeToInvert(A, A->hasOneUse()) && - !IsFreeToInvert(B, B->hasOneUse())) { + !isFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(B, B->hasOneUse())) { Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); return BinaryOperator::CreateNot(AndOr); } @@ -1770,13 +1885,13 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C if (match(Op0, m_Xor(m_Value(A), 
m_Value(B)))) if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A)))) - if (Op1->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C)); // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B)))) if (match(Op1, m_Xor(m_Specific(B), m_Specific(A)))) - if (Op0->hasOneUse() || IsFreeToInvert(C, C->hasOneUse())) + if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse())) return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C)); // (A | B) & ((~A) ^ B) -> (A & B) @@ -1844,6 +1959,20 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { A->getType()->isIntOrIntVectorTy(1)) return SelectInst::Create(A, Op0, Constant::getNullValue(I.getType())); + // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_And(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty)); + } + } + return nullptr; } @@ -2057,6 +2186,8 @@ Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B, /// Fold (icmp)|(icmp) if possible. Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI) { + const SimplifyQuery Q = SQ.getWithInstruction(&CxtI); + // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2) // if K1 and K2 are a one-bit mask. if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI)) @@ -2182,6 +2313,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder)) return V; + if (Value *X = + foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder)) + return X; + if (Value *X = + foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder)) + return X; + // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2). if (!LHSC || !RHSC) return nullptr; @@ -2251,8 +2389,19 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, case ICmpInst::ICMP_EQ: // Potential folds for this case should already be handled. 
break; - case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change - case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change + case ICmpInst::ICMP_UGT: + // (X == 0 || X u> C) -> (X-1) u>= C + if (LHSC->isMinValue(false)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + false, false); + // (X == 13 | X u> 14) -> no change + break; + case ICmpInst::ICMP_SGT: + // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN + if (LHSC->isMinValue(true)) + return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1, + true, false); + // (X == 13 | X s> 14) -> no change break; } break; @@ -2261,6 +2410,10 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change + // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C + if (RHSC->isMaxValue(false)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + false, false); break; case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(false) && "Missed icmp simplification"); @@ -2272,9 +2425,14 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, switch (PredR) { default: llvm_unreachable("Unknown integer condition code!"); - case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change + case ICmpInst::ICMP_EQ: + // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C + if (RHSC->isMaxValue(true)) + return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(), + true, false); + // (X s< 13 | X == 14) -> no change break; - case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2 + case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2 assert(!RHSC->isMaxValue(true) && "Missed icmp simplification"); return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true, false); @@ -2552,6 +2710,25 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { } } + // or(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? -1 : X. + { + Value *X, *Y; + const APInt *ShAmt; + Type *Ty = I.getType(); + if (match(&I, m_c_Or(m_OneUse(m_AShr(m_NSWSub(m_Value(Y), m_Value(X)), + m_APInt(ShAmt))), + m_Deferred(X))) && + *ShAmt == Ty->getScalarSizeInBits() - 1) { + Value *NewICmpInst = Builder.CreateICmpSGT(X, Y); + return SelectInst::Create(NewICmpInst, ConstantInt::getAllOnesValue(Ty), + X); + } + } + + if (Instruction *V = + canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I)) + return V; + return nullptr; } @@ -2617,7 +2794,11 @@ static Instruction *foldXorToXor(BinaryOperator &I, return nullptr; } -Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { +Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, + BinaryOperator &I) { + assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS && + I.getOperand(1) == RHS && "Should be 'xor' with these operands"); + if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) { if (LHS->getOperand(0) == RHS->getOperand(1) && LHS->getOperand(1) == RHS->getOperand(0)) @@ -2672,14 +2853,35 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) { // TODO: If OrICmp is false, the whole thing is false (InstSimplify?). if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) { // TODO: Independently handle cases where the 'and' side is a constant. 
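(Side note on the new equality-plus-relational folds above: subtracting the range's lower bound rotates it so it starts at zero, letting one unsigned comparison replace two. For the fold commented as "(X == 0 || X u> C) -> (X-1) u>= C", a brute-force sanity check at 8 bits, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XI = 0; XI < 256; ++XI) {
    for (unsigned CI = 0; CI < 256; ++CI) {
      uint8_t X = XI, C = CI;
      // (X == 0 || X u> C)  <=>  (X - 1) u>= C
      assert(((X == 0) || (X > C)) == (static_cast<uint8_t>(X - 1) >= C));
    }
  }
}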
- if (OrICmp == LHS && AndICmp == RHS && RHS->hasOneUse()) { - // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS - RHS->setPredicate(RHS->getInversePredicate()); - return Builder.CreateAnd(LHS, RHS); + ICmpInst *X = nullptr, *Y = nullptr; + if (OrICmp == LHS && AndICmp == RHS) { + // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y + X = LHS; + Y = RHS; } - if (OrICmp == RHS && AndICmp == LHS && LHS->hasOneUse()) { - // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS - LHS->setPredicate(LHS->getInversePredicate()); + if (OrICmp == RHS && AndICmp == LHS) { + // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X + X = RHS; + Y = LHS; + } + if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) { + // Invert the predicate of 'Y', thus inverting its output. + Y->setPredicate(Y->getInversePredicate()); + // So, are there other uses of Y? + if (!Y->hasOneUse()) { + // We need to adapt other uses of Y though. Get a value that matches + // the original value of Y before inversion. While this increases + // immediate instruction count, we have just ensured that all the + // users are freely-invertible, so that 'not' *will* get folded away. + BuilderTy::InsertPointGuard Guard(Builder); + // Set insertion point to right after the Y. + Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator())); + Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not"); + // Replace all uses of Y (excluding the one in NotY!) with NotY. + Y->replaceUsesWithIf(NotY, + [NotY](Use &U) { return U.getUser() != NotY; }); + } + // All done. return Builder.CreateAnd(LHS, RHS); } } @@ -2747,9 +2949,9 @@ static Instruction *sinkNotIntoXor(BinaryOperator &I, return nullptr; // We only want to do the transform if it is free to do. - if (IsFreeToInvert(X, X->hasOneUse())) { + if (isFreeToInvert(X, X->hasOneUse())) { // Ok, good. - } else if (IsFreeToInvert(Y, Y->hasOneUse())) { + } else if (isFreeToInvert(Y, Y->hasOneUse())) { std::swap(X, Y); } else return nullptr; @@ -2827,9 +3029,9 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // Apply DeMorgan's Law when inverts are free: // ~(X & Y) --> (~X | ~Y) // ~(X | Y) --> (~X & ~Y) - if (IsFreeToInvert(NotVal->getOperand(0), + if (isFreeToInvert(NotVal->getOperand(0), NotVal->getOperand(0)->hasOneUse()) && - IsFreeToInvert(NotVal->getOperand(1), + isFreeToInvert(NotVal->getOperand(1), NotVal->getOperand(1)->hasOneUse())) { Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs"); Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs"); @@ -3004,7 +3206,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) - if (Value *V = foldXorOfICmps(LHS, RHS)) + if (Value *V = foldXorOfICmps(LHS, RHS, I)) return replaceInstUsesWith(I, V); if (Instruction *CastedXor = foldCastedBitwiseLogic(I)) @@ -3052,7 +3254,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (SelectPatternResult::isMinOrMax(SPF)) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. 
- if (match(LHS, m_Not(m_Value(X))) && !IsFreeToInvert(X, X->hasOneUse())) { + if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) { Value *NotY = Builder.CreateNot(RHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY); @@ -3060,7 +3262,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // It's possible we get here before the not has been simplified, so make // sure the input to the not isn't freely invertible. - if (match(RHS, m_Not(m_Value(Y))) && !IsFreeToInvert(Y, Y->hasOneUse())) { + if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) { Value *NotX = Builder.CreateNot(LHS); return SelectInst::Create( Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y); @@ -3068,8 +3270,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { // If both sides are freely invertible, then we can get rid of the xor // completely. - if (IsFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && - IsFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { + if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) && + isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) { Value *NotLHS = Builder.CreateNot(LHS); Value *NotRHS = Builder.CreateNot(RHS); return SelectInst::Create( @@ -3077,6 +3279,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { NotLHS, NotRHS); } } + + // Pull 'not' into operands of select if both operands are one-use compares. + // Inverting the predicates eliminates the 'not' operation. + // Example: + // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?) --> + // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?) + // TODO: Canonicalize by hoisting 'not' into an arm of the select if only + // 1 select operand is a cmp? + if (auto *Sel = dyn_cast<SelectInst>(Op0)) { + auto *CmpT = dyn_cast<CmpInst>(Sel->getTrueValue()); + auto *CmpF = dyn_cast<CmpInst>(Sel->getFalseValue()); + if (CmpT && CmpF && CmpT->hasOneUse() && CmpF->hasOneUse()) { + CmpT->setPredicate(CmpT->getInversePredicate()); + CmpF->setPredicate(CmpF->getInversePredicate()); + return replaceInstUsesWith(I, Sel); + } + } } if (Instruction *NewXor = sinkNotIntoXor(I, Builder)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 5f37a00f56cf..825f4b468b0a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } @@ -154,6 +154,6 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand()); Load->setAtomic(Ordering, RMWI.getSyncScopeID()); - Load->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + Load->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return Load; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4b3333affa72..f463c5fa1138 100644 --- 
a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -40,6 +40,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IntrinsicsAArch64.h" +#include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsPowerPC.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" @@ -185,7 +191,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(CopySrcAlign); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +205,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. - S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +231,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +252,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed @@ -1060,9 +1065,9 @@ Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) { // If we can unconditionally load from this address, replace with a // load/select idiom. 
TODO: use DT for context sensitive query - if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment, - II.getModule()->getDataLayout(), - &II, nullptr)) { + if (isDereferenceableAndAlignedPointer( + LoadPtr, II.getType(), MaybeAlign(Alignment), + II.getModule()->getDataLayout(), &II, nullptr)) { Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment, "unmaskedload"); return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3)); @@ -1086,7 +1091,8 @@ Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) { // If the mask is all ones, this is a plain vector store of the 1st argument. if (ConstMask->isAllOnesValue()) { Value *StorePtr = II.getArgOperand(1); - unsigned Alignment = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue(); + MaybeAlign Alignment( + cast<ConstantInt>(II.getArgOperand(2))->getZExtValue()); return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment); } @@ -2234,6 +2240,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return replaceInstUsesWith(*II, Add); } + // Try to simplify the underlying FMul. + if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); + FAdd->copyFastMathFlags(II); + return FAdd; + } + LLVM_FALLTHROUGH; } case Intrinsic::fma: { @@ -2258,15 +2273,47 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return II; } - // fma x, 1, z -> fadd x, z - if (match(Src1, m_FPOne())) { - auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2)); + // Try to simplify the underlying FMul. We can only apply simplifications + // that do not require rounding. + if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1), + II->getFastMathFlags(), + SQ.getWithInstruction(II))) { + auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2)); FAdd->copyFastMathFlags(II); return FAdd; } break; } + case Intrinsic::copysign: { + if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) { + // If we know that the sign argument is positive, reduce to FABS: + // copysign X, Pos --> fabs X + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Fabs); + } + // TODO: There should be a ValueTracking sibling like SignBitMustBeOne. + const APFloat *C; + if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) { + // If we know that the sign argument is negative, reduce to FNABS: + // copysign X, Neg --> fneg (fabs X) + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II)); + } + + // Propagate sign argument through nested calls: + // copysign X, (copysign ?, SignArg) --> copysign X, SignArg + Value *SignArg; + if (match(II->getArgOperand(1), + m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) { + II->setArgOperand(1, SignArg); + return II; + } + + break; + } case Intrinsic::fabs: { Value *Cond; Constant *LHS, *RHS; @@ -2331,7 +2378,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX loads into normal loads. 
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0), PointerType::getUnqual(II->getType())); - return new LoadInst(II->getType(), Ptr, Twine(""), false, 1); + return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None()); } case Intrinsic::ppc_altivec_stvx: case Intrinsic::ppc_altivec_stvxl: @@ -2349,7 +2396,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Turn PPC VSX stores into normal stores. Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType()); Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy); - return new StoreInst(II->getArgOperand(0), Ptr, false, 1); + return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None()); } case Intrinsic::ppc_qpx_qvlfs: // Turn PPC QPX qvlfs -> load if the pointer is known aligned. @@ -2440,6 +2487,64 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // TODO should we convert this to an AND if the RHS is constant? } break; + case Intrinsic::x86_bmi_pext_32: + case Intrinsic::x86_bmi_pext_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToSet = 1; + + while (Mask) { + // Isolate lowest set bit. + uint64_t BitToTest = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToSet <<= 1; + // Clear lowest set bit. + Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; + case Intrinsic::x86_bmi_pdep_32: + case Intrinsic::x86_bmi_pdep_64: + if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) { + if (MaskC->isNullValue()) + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0)); + if (MaskC->isAllOnesValue()) + return replaceInstUsesWith(CI, II->getArgOperand(0)); + + if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) { + uint64_t Src = SrcC->getZExtValue(); + uint64_t Mask = MaskC->getZExtValue(); + uint64_t Result = 0; + uint64_t BitToTest = 1; + + while (Mask) { + // Isolate lowest set bit. 
+ uint64_t BitToSet = Mask & -Mask; + if (BitToTest & Src) + Result |= BitToSet; + + BitToTest <<= 1; + // Clear lowest set bit; + Mask &= Mask - 1; + } + + return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result)); + } + } + break; case Intrinsic::x86_vcvtph2ps_128: case Intrinsic::x86_vcvtph2ps_256: { @@ -3296,6 +3401,60 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::arm_mve_pred_i2v: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) && + II->getType() == ArgArg->getType()) + return replaceInstUsesWith(*II, ArgArg); + Constant *XorMask; + if (match(Arg, + m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)), + m_Constant(XorMask))) && + II->getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast<ConstantInt>(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = Builder.CreateVectorSplat( + II->getType()->getVectorNumElements(), Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } + KnownBits ScalarKnown(32); + if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), + ScalarKnown, 0)) + return II; + break; + } + case Intrinsic::arm_mve_pred_v2i: { + Value *Arg = II->getArgOperand(0); + Value *ArgArg; + if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg)))) + return replaceInstUsesWith(*II, ArgArg); + if (!II->getMetadata(LLVMContext::MD_range)) { + Type *IntTy32 = Type::getInt32Ty(II->getContext()); + Metadata *M[] = { + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)), + ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF)) + }; + II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M)); + return II; + } + break; + } + case Intrinsic::arm_mve_vadc: + case Intrinsic::arm_mve_vadc_predicated: { + unsigned CarryOp = + (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2; + assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 && + "Bad type for intrinsic!"); + + KnownBits CarryKnown(32); + if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29), + CarryKnown)) + return II; + break; + } case Intrinsic::amdgcn_rcp: { Value *Src = II->getArgOperand(0); @@ -3305,7 +3464,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) { const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat Val(ArgVal.getSemantics(), 1); APFloat::opStatus Status = Val.divide(ArgVal, APFloat::rmNearestTiesToEven); // Only do this if it was exact and therefore not dependent on the @@ -3860,7 +4019,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return eraseInstFromFunction(CI); // Bail if we cross over an intrinsic with side effects, such as - // llvm.stacksave, llvm.read_register, or llvm.setjmp. + // llvm.stacksave, or llvm.read_register. if (II2->mayHaveSideEffects()) { CannotRemove = true; break; @@ -3885,6 +4044,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Asan needs to poison memory to detect invalid access which is possible // even for empty lifetime range. 
if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) || + II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) || II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress)) break; @@ -3950,10 +4110,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } case Intrinsic::experimental_gc_relocate: { + auto &GCR = *cast<GCRelocateInst>(II); + + // If we have two copies of the same pointer in the statepoint argument + // list, canonicalize to one. This may let us common gc.relocates. + if (GCR.getBasePtr() == GCR.getDerivedPtr() && + GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) { + auto *OpIntTy = GCR.getOperand(2)->getType(); + II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex())); + return II; + } + // Translate facts known about a pointer before relocating into // facts about the relocate value, while being careful to // preserve relocation semantics. - Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr(); + Value *DerivedPtr = GCR.getDerivedPtr(); // Remove the relocation if unused, note that this check is required // to prevent the cases below from looping forever. @@ -3995,12 +4166,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Is this guard followed by another guard? We scan forward over a small // fixed window of instructions to handle common cases with conditions // computed between guards. - Instruction *NextInst = II->getNextNode(); + Instruction *NextInst = II->getNextNonDebugInstruction(); for (unsigned i = 0; i < GuardWideningWindow; i++) { // Note: Using context-free form to avoid compile time blow up if (!isSafeToSpeculativelyExecute(NextInst)) break; - NextInst = NextInst->getNextNode(); + NextInst = NextInst->getNextNonDebugInstruction(); } Value *NextCond = nullptr; if (match(NextInst, @@ -4008,18 +4179,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Value *CurrCond = II->getArgOperand(0); // Remove a guard that it is immediately preceded by an identical guard. - if (CurrCond == NextCond) - return eraseInstFromFunction(*NextInst); - // Otherwise canonicalize guard(a); guard(b) -> guard(a & b). - Instruction* MoveI = II->getNextNode(); - while (MoveI != NextInst) { - auto *Temp = MoveI; - MoveI = MoveI->getNextNode(); - Temp->moveBefore(II); + if (CurrCond != NextCond) { + Instruction *MoveI = II->getNextNonDebugInstruction(); + while (MoveI != NextInst) { + auto *Temp = MoveI; + MoveI = MoveI->getNextNonDebugInstruction(); + Temp->moveBefore(II); + } + II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); } - II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond)); - return eraseInstFromFunction(*NextInst); + eraseInstFromFunction(*NextInst); + return II; } break; } @@ -4177,10 +4348,58 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) { return nullptr; } +static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) { + unsigned NumArgs = Call.getNumArgOperands(); + ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0)); + ConstantInt *Op1C = + (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1)); + // Bail out if the allocation size is zero. 
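(A rough scalar analogue of what annotateAnyAllocSite computes for calloc-like calls below, where the IR code uses APInt::umul_ov: the dereferenceable byte count is only attached when nmemb * size does not overflow. Hypothetical helper, not part of the patch:)

#include <cstdint>
#include <optional>

// Byte count to annotate for calloc(N, Size), or nothing if the unsigned
// multiplication overflows (in which case no attribute is added).
std::optional<uint64_t> callocDereferenceableBytes(uint64_t N, uint64_t Size) {
  if (Size != 0 && N > UINT64_MAX / Size)
    return std::nullopt; // overflow, mirrors the umul_ov overflow flag
  return N * Size;
}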
+ if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue())) + return; + + if (isMallocLikeFn(&Call, TLI) && Op0C) { + if (isOpNewLikeFn(&Call, TLI)) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableBytes( + Call.getContext(), Op0C->getZExtValue())); + else + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op0C->getZExtValue())); + } else if (isReallocLikeFn(&Call, TLI) && Op1C) { + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Op1C->getZExtValue())); + } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) { + bool Overflow; + const APInt &N = Op0C->getValue(); + APInt Size = N.umul_ov(Op1C->getValue(), Overflow); + if (!Overflow) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Size.getZExtValue())); + } else if (isStrdupLikeFn(&Call, TLI)) { + uint64_t Len = GetStringLength(Call.getOperand(0)); + if (Len) { + // strdup + if (NumArgs == 1) + Call.addAttribute(AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), Len)); + // strndup + else if (NumArgs == 2 && Op1C) + Call.addAttribute( + AttributeList::ReturnIndex, + Attribute::getWithDereferenceableOrNullBytes( + Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1))); + } + } +} + /// Improvements for call, callbr and invoke instructions. Instruction *InstCombiner::visitCallBase(CallBase &Call) { - if (isAllocLikeFn(&Call, &TLI)) - return visitAllocSite(Call); + if (isAllocationFn(&Call, &TLI)) + annotateAnyAllocSite(Call, &TLI); bool Changed = false; @@ -4312,6 +4531,9 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) { if (I) return eraseInstFromFunction(*I); } + if (isAllocLikeFn(&Call, &TLI)) + return visitAllocSite(Call); + return Changed ? &Call : nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 2c9ba203fbf3..71b7f279e5fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/DIBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include <numeric> using namespace llvm; using namespace PatternMatch; @@ -140,7 +141,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, } AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); New->takeName(&AI); New->setUsedWithInAlloca(AI.isUsedWithInAlloca()); @@ -843,33 +844,33 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return nullptr; } -Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, +Instruction *InstCombiner::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext, bool DoTransform) { // If we are just checking for a icmp eq of a single bit and zext'ing it // to an integer, then shift the bit to the appropriate place and then // cast to integer to avoid the comparison. const APInt *Op1CV; - if (match(ICI->getOperand(1), m_APInt(Op1CV))) { + if (match(Cmp->getOperand(1), m_APInt(Op1CV))) { // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. 
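(The two comments above translate directly into scalar code; a small check assuming 32-bit integers, illustrative only:)

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX}) {
    uint32_t U = static_cast<uint32_t>(X);
    // zext (x <s 0)  --> x >>u 31
    assert(static_cast<uint32_t>(X < 0) == (U >> 31));
    // zext (x >s -1) --> (x >>u 31) ^ 1
    assert(static_cast<uint32_t>(X > -1) == ((U >> 31) ^ 1));
  }
}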
- if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || - (ICI->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { - if (!DoTransform) return ICI; + if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) || + (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) { + if (!DoTransform) return Cmp; - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); Value *Sh = ConstantInt::get(In->getType(), In->getType()->getScalarSizeInBits() - 1); In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit"); - if (In->getType() != CI.getType()) - In = Builder.CreateIntCast(In, CI.getType(), false /*ZExt*/); + if (In->getType() != Zext.getType()) + In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/); - if (ICI->getPredicate() == ICmpInst::ICMP_SGT) { + if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) { Constant *One = ConstantInt::get(In->getType(), 1); In = Builder.CreateXor(In, One, In->getName() + ".not"); } - return replaceInstUsesWith(CI, In); + return replaceInstUsesWith(Zext, In); } // zext (X == 0) to i32 --> X^1 iff X has only the low bit set. @@ -882,24 +883,24 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) && // This only works for EQ and NE - ICI->isEquality()) { + Cmp->isEquality()) { // If Op1C some other power of two, convert: - KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); + KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; - bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE; + bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE; if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) { // (X&4) == 2 --> false // (X&4) != 2 --> true - Constant *Res = ConstantInt::get(CI.getType(), isNE); - return replaceInstUsesWith(CI, Res); + Constant *Res = ConstantInt::get(Zext.getType(), isNE); + return replaceInstUsesWith(Zext, Res); } uint32_t ShAmt = KnownZeroMask.logBase2(); - Value *In = ICI->getOperand(0); + Value *In = Cmp->getOperand(0); if (ShAmt) { // Perform a logical shr by shiftamt. // Insert the shift to put the result in the low bit. @@ -912,11 +913,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, In = Builder.CreateXor(In, One); } - if (CI.getType() == In->getType()) - return replaceInstUsesWith(CI, In); + if (Zext.getType() == In->getType()) + return replaceInstUsesWith(Zext, In); - Value *IntCast = Builder.CreateIntCast(In, CI.getType(), false); - return replaceInstUsesWith(CI, IntCast); + Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false); + return replaceInstUsesWith(Zext, IntCast); } } } @@ -924,19 +925,19 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // icmp ne A, B is equal to xor A, B when A and B only really have one bit. // It is also profitable to transform icmp eq into not(xor(A, B)) because that // may lead to additional simplifications. 
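// Illustrative aside (not part of the patch): when A and B are known to agree
// in every bit except one (bit 3 in this hypothetical), zext(A != B) is that
// bit of A ^ B shifted into the low position, and zext(A == B) is its xor with
// 1, matching the transform above.
#include <assert.h>
#include <stdint.h>
int main(void) {
  uint32_t vals[] = {0u, 8u}; // only bit 3 is unknown
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 2; ++j) {
      uint32_t a = vals[i], b = vals[j];
      assert((uint32_t)(a != b) == ((a ^ b) >> 3));
      assert((uint32_t)(a == b) == (((a ^ b) >> 3) ^ 1u));
    }
  }
  return 0;
}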
- if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { - if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) { - Value *LHS = ICI->getOperand(0); - Value *RHS = ICI->getOperand(1); + if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) { + if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); - KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); - KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; APInt UnknownBit = ~KnownBits; if (UnknownBit.countPopulation() == 1) { - if (!DoTransform) return ICI; + if (!DoTransform) return Cmp; Value *Result = Builder.CreateXor(LHS, RHS); @@ -949,10 +950,10 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, Result = Builder.CreateLShr( Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros())); - if (ICI->getPredicate() == ICmpInst::ICMP_EQ) + if (Cmp->getPredicate() == ICmpInst::ICMP_EQ) Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1)); - Result->takeName(ICI); - return replaceInstUsesWith(CI, Result); + Result->takeName(Cmp); + return replaceInstUsesWith(Zext, Result); } } } @@ -1172,8 +1173,8 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { } } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) - return transformZExtICmp(ICI, CI); + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src)) + return transformZExtICmp(Cmp, CI); BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src); if (SrcI && SrcI->getOpcode() == Instruction::Or) { @@ -1188,7 +1189,9 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName()); Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName()); - BinaryOperator *Or = BinaryOperator::Create(Instruction::Or, LCast, RCast); + Value *Or = Builder.CreateOr(LCast, RCast, CI.getName()); + if (auto *OrInst = dyn_cast<Instruction>(Or)) + Builder.SetInsertPoint(OrInst); // Perform the elimination. if (auto *LZExt = dyn_cast<ZExtInst>(LCast)) @@ -1196,7 +1199,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (auto *RZExt = dyn_cast<ZExtInst>(RCast)) transformZExtICmp(RHS, *RZExt); - return Or; + return replaceInstUsesWith(CI, Or); } } @@ -1531,16 +1534,16 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // what we can and cannot do safely varies from operation to operation, and // is explained below in the various case statements. 
Type *Ty = FPT.getType(); - BinaryOperator *OpI = dyn_cast<BinaryOperator>(FPT.getOperand(0)); - if (OpI && OpI->hasOneUse()) { - Type *LHSMinType = getMinimumFPType(OpI->getOperand(0)); - Type *RHSMinType = getMinimumFPType(OpI->getOperand(1)); - unsigned OpWidth = OpI->getType()->getFPMantissaWidth(); + auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0)); + if (BO && BO->hasOneUse()) { + Type *LHSMinType = getMinimumFPType(BO->getOperand(0)); + Type *RHSMinType = getMinimumFPType(BO->getOperand(1)); + unsigned OpWidth = BO->getType()->getFPMantissaWidth(); unsigned LHSWidth = LHSMinType->getFPMantissaWidth(); unsigned RHSWidth = RHSMinType->getFPMantissaWidth(); unsigned SrcWidth = std::max(LHSWidth, RHSWidth); unsigned DstWidth = Ty->getFPMantissaWidth(); - switch (OpI->getOpcode()) { + switch (BO->getOpcode()) { default: break; case Instruction::FAdd: case Instruction::FSub: @@ -1563,10 +1566,10 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // could be tightened for those cases, but they are rare (the main // case of interest here is (float)((double)float + float)). if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - Instruction *RI = BinaryOperator::Create(OpI->getOpcode(), LHS, RHS); - RI->copyFastMathFlags(OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS); + RI->copyFastMathFlags(BO); return RI; } break; @@ -1577,9 +1580,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // rounding can possibly occur; we can safely perform the operation // in the destination format if it can represent both sources. if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFMulFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFMulFMF(LHS, RHS, BO); } break; case Instruction::FDiv: @@ -1590,9 +1593,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { // condition used here is a good conservative first pass. // TODO: Tighten bound via rigorous analysis of the unbalanced case. 
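// Illustrative aside (not part of the patch): the headline FDiv case narrows
// (float)((double)x / (double)y) to x / y because double's 53-bit mantissa
// satisfies OpWidth >= 2*DstWidth (53 >= 48). A randomized spot-check, not a
// proof; assumes plain IEEE float/double evaluation (FLT_EVAL_METHOD == 0).
#include <assert.h>
#include <stdlib.h>
int main(void) {
  srand(1);
  for (int i = 0; i < 100000; ++i) {
    float x = (float)(rand() % 100000) / 7.0f;
    float y = (float)(rand() % 99999 + 1) / 3.0f; // keep y nonzero
    float narrowed = (float)((double)x / (double)y);
    float direct = x / y;
    assert(narrowed == direct);
  }
  return 0;
}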
if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) { - Value *LHS = Builder.CreateFPTrunc(OpI->getOperand(0), Ty); - Value *RHS = Builder.CreateFPTrunc(OpI->getOperand(1), Ty); - return BinaryOperator::CreateFDivFMF(LHS, RHS, OpI); + Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty); + Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty); + return BinaryOperator::CreateFDivFMF(LHS, RHS, BO); } break; case Instruction::FRem: { @@ -1604,14 +1607,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { break; Value *LHS, *RHS; if (LHSWidth == SrcWidth) { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), LHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), LHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType); } else { - LHS = Builder.CreateFPTrunc(OpI->getOperand(0), RHSMinType); - RHS = Builder.CreateFPTrunc(OpI->getOperand(1), RHSMinType); + LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType); + RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType); } - Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, OpI); + Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO); return CastInst::CreateFPCast(ExactResult, Ty); } } @@ -1621,6 +1624,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { Value *X; Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0)); if (Op && Op->hasOneUse()) { + // FIXME: The FMF should propagate from the fptrunc, not the source op. + IRBuilder<>::FastMathFlagGuard FMFG(Builder); + if (isa<FPMathOperator>(Op)) + Builder.setFastMathFlags(Op->getFastMathFlags()); + if (match(Op, m_FNeg(m_Value(X)))) { Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty); @@ -1630,6 +1638,24 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) { return BinaryOperator::CreateFNegFMF(InnerTrunc, Op); return UnaryOperator::CreateFNegFMF(InnerTrunc, Op); } + + // If we are truncating a select that has an extended operand, we can + // narrow the other operand and do the select as a narrow op. + Value *Cond, *X, *Y; + if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) && + X->getType() == Ty) { + // fptrunc (select Cond, (fpext X), Y --> select Cond, X, (fptrunc Y) + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } + if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) && + X->getType() == Ty) { + // fptrunc (select Cond, Y, (fpext X) --> select Cond, (fptrunc Y), X + Value *NarrowY = Builder.CreateFPTrunc(Y, Ty); + Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op); + return replaceInstUsesWith(FPT, Sel); + } } if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) { @@ -1808,7 +1834,7 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { Type *Ty = CI.getType(); unsigned AS = CI.getPointerAddressSpace(); - if (Ty->getScalarSizeInBits() == DL.getIndexSizeInBits(AS)) + if (Ty->getScalarSizeInBits() == DL.getPointerSizeInBits(AS)) return commonPointerCastTransforms(CI); Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS); @@ -1820,12 +1846,24 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { } /// This input value (which is known to have vector type) is being zero extended -/// or truncated to the specified vector type. +/// or truncated to the specified vector type. 
Since the zext/trunc is done +/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, +/// endianness will impact which end of the vector that is extended or +/// truncated. +/// +/// A vector is always stored with index 0 at the lowest address, which +/// corresponds to the most significant bits for a big endian stored integer and +/// the least significant bits for little endian. A trunc/zext of an integer +/// impacts the big end of the integer. Thus, we need to add/remove elements at +/// the front of the vector for big endian targets, and the back of the vector +/// for little endian targets. +/// /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, - InstCombiner &IC) { +static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, + VectorType *DestTy, + InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -1844,31 +1882,53 @@ static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InVal = IC.Builder.CreateBitCast(InVal, SrcTy); } + bool IsBigEndian = IC.getDataLayout().isBigEndian(); + unsigned SrcElts = SrcTy->getNumElements(); + unsigned DestElts = DestTy->getNumElements(); + + assert(SrcElts != DestElts && "Element counts should be different."); + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. - SmallVector<uint32_t, 16> ShuffleMask; + SmallVector<uint32_t, 16> ShuffleMaskStorage; + ArrayRef<uint32_t> ShuffleMask; Value *V2; - if (SrcTy->getNumElements() > DestTy->getNumElements()) { - // If we're shrinking the number of elements, just shuffle in the low - // elements from the input and use undef as the second shuffle input. - V2 = UndefValue::get(SrcTy); - for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) - ShuffleMask.push_back(i); + // Produce an identify shuffle mask for the src vector. + ShuffleMaskStorage.resize(SrcElts); + std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0); + if (SrcElts > DestElts) { + // If we're shrinking the number of elements (rewriting an integer + // truncate), just shuffle in the elements corresponding to the least + // significant bits from the input and use undef as the second shuffle + // input. + V2 = UndefValue::get(SrcTy); + // Make sure the shuffle mask selects the "least significant bits" by + // keeping elements from back of the src vector for big endian, and from the + // front for little endian. + ShuffleMask = ShuffleMaskStorage; + if (IsBigEndian) + ShuffleMask = ShuffleMask.take_back(DestElts); + else + ShuffleMask = ShuffleMask.take_front(DestElts); } else { - // If we're increasing the number of elements, shuffle in all of the - // elements from InVal and fill the rest of the result elements with zeros - // from a constant zero. + // If we're increasing the number of elements (rewriting an integer zext), + // shuffle in all of the elements from InVal. Fill the rest of the result + // elements with zeros from a constant zero. 
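// Illustrative aside (not part of the patch): on a little-endian host, a zext
// keeps the original bytes at the low addresses (the front of the vector) and
// appends the zero bytes at the back, which is why the null elements are added
// at the end of the shuffle mask there, and at the front for big endian.
#include <assert.h>
#include <stdint.h>
#include <string.h>
int main(void) {
  uint32_t narrow = 0x11223344u;
  uint64_t wide = narrow; // zext i32 -> i64
  unsigned char n[4], w[8];
  memcpy(n, &narrow, sizeof(n));
  memcpy(w, &wide, sizeof(w));
  uint16_t probe = 1;
  if (*(unsigned char *)&probe == 1) { // little-endian host only
    assert(memcmp(w, n, 4) == 0);                             // front: original
    assert(w[4] == 0 && w[5] == 0 && w[6] == 0 && w[7] == 0); // back: zeros
  }
  return 0;
}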
V2 = Constant::getNullValue(SrcTy); - unsigned SrcElts = SrcTy->getNumElements(); - for (unsigned i = 0, e = SrcElts; i != e; ++i) - ShuffleMask.push_back(i); - - // The excess elements reference the first element of the zero input. - for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) - ShuffleMask.push_back(SrcElts); + // Use first elt from V2 when indicating zero in the shuffle mask. + uint32_t NullElt = SrcElts; + // Extend with null values in the "most significant bits" by adding elements + // in front of the src vector for big endian, and at the back for little + // endian. + unsigned DeltaElts = DestElts - SrcElts; + if (IsBigEndian) + ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt); + else + ShuffleMaskStorage.append(DeltaElts, NullElt); + ShuffleMask = ShuffleMaskStorage; } return new ShuffleVectorInst(InVal, V2, @@ -2217,6 +2277,31 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { } } + // Check that each user of each old PHI node is something that we can + // rewrite, so that all of the old PHI nodes can be cleaned up afterwards. + for (auto *OldPN : OldPhiNodes) { + for (User *V : OldPN->users()) { + if (auto *SI = dyn_cast<StoreInst>(V)) { + if (!SI->isSimple() || SI->getOperand(0) != OldPN) + return nullptr; + } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { + // Verify it's a B->A cast. + Type *TyB = BCI->getOperand(0)->getType(); + Type *TyA = BCI->getType(); + if (TyA != DestTy || TyB != SrcTy) + return nullptr; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + // As long as the user is another old PHI node, then even if we don't + // rewrite it, the PHI web we're considering won't have any users + // outside itself, so it'll be dead. + if (OldPhiNodes.count(PHI) == 0) + return nullptr; + } else { + return nullptr; + } + } + } + // For each old PHI node, create a corresponding new PHI node with a type A. SmallDenseMap<PHINode *, PHINode *> NewPNodes; for (auto *OldPN : OldPhiNodes) { @@ -2234,9 +2319,14 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { if (auto *C = dyn_cast<Constant>(V)) { NewV = ConstantExpr::getBitCast(C, DestTy); } else if (auto *LI = dyn_cast<LoadInst>(V)) { - Builder.SetInsertPoint(LI->getNextNode()); - NewV = Builder.CreateBitCast(LI, DestTy); - Worklist.Add(LI); + // Explicitly perform load combine to make sure no opposing transform + // can remove the bitcast in the meantime and trigger an infinite loop. + Builder.SetInsertPoint(LI); + NewV = combineLoadToNewType(*LI, DestTy); + // Remove the old load and its use in the old phi, which itself becomes + // dead once the whole transform finishes. + replaceInstUsesWith(*LI, UndefValue::get(LI->getType())); + eraseInstFromFunction(*LI); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { NewV = BCI->getOperand(0); } else if (auto *PrevPN = dyn_cast<PHINode>(V)) { @@ -2259,26 +2349,33 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) { Instruction *RetVal = nullptr; for (auto *OldPN : OldPhiNodes) { PHINode *NewPN = NewPNodes[OldPN]; - for (User *V : OldPN->users()) { + for (auto It = OldPN->user_begin(), End = OldPN->user_end(); It != End; ) { + User *V = *It; + // We may remove this user, advance to avoid iterator invalidation. 
+ ++It; if (auto *SI = dyn_cast<StoreInst>(V)) { - if (SI->isSimple() && SI->getOperand(0) == OldPN) { - Builder.SetInsertPoint(SI); - auto *NewBC = - cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); - SI->setOperand(0, NewBC); - Worklist.Add(SI); - assert(hasStoreUsersOnly(*NewBC)); - } + assert(SI->isSimple() && SI->getOperand(0) == OldPN); + Builder.SetInsertPoint(SI); + auto *NewBC = + cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy)); + SI->setOperand(0, NewBC); + Worklist.Add(SI); + assert(hasStoreUsersOnly(*NewBC)); } else if (auto *BCI = dyn_cast<BitCastInst>(V)) { - // Verify it's a B->A cast. Type *TyB = BCI->getOperand(0)->getType(); Type *TyA = BCI->getType(); - if (TyA == DestTy && TyB == SrcTy) { - Instruction *I = replaceInstUsesWith(*BCI, NewPN); - if (BCI == &CI) - RetVal = I; - } + assert(TyA == DestTy && TyB == SrcTy); + (void) TyA; + (void) TyB; + Instruction *I = replaceInstUsesWith(*BCI, NewPN); + if (BCI == &CI) + RetVal = I; + } else if (auto *PHI = dyn_cast<PHINode>(V)) { + assert(OldPhiNodes.count(PHI) > 0); + (void) PHI; + } else { + llvm_unreachable("all uses should be handled"); } } } @@ -2338,8 +2435,23 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If we found a path from the src to dest, create the getelementptr now. if (SrcElTy == DstElTy) { SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0)); - return GetElementPtrInst::CreateInBounds(SrcPTy->getElementType(), Src, - Idxs); + GetElementPtrInst *GEP = + GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs); + + // If the source pointer is dereferenceable, then assume it points to an + // allocated object and apply "inbounds" to the GEP. + bool CanBeNull; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) { + // In a non-default address space (not 0), a null pointer can not be + // assumed inbounds, so ignore that case (dereferenceable_or_null). + // The reason is that 'null' is not treated differently in these address + // spaces, and we consequently ignore the 'gep inbounds' special case + // for 'null' which allows 'inbounds' on 'null' if the indices are + // zeros. + if (SrcPTy->getAddressSpace() == 0 || !CanBeNull) + GEP->setIsInBounds(); + } + return GEP; } } @@ -2359,8 +2471,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast<CastInst>(Src); if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0))) if (isa<VectorType>(BCIn->getOperand(0)->getType())) - if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), - cast<VectorType>(DestTy), *this)) + if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts( + BCIn->getOperand(0), cast<VectorType>(DestTy), *this)) return I; } @@ -2391,28 +2503,47 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } - if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { + if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). Check to see if this is // a bitcast to a vector with the same # elts. 
- if (SVI->hasOneUse() && DestTy->isVectorTy() && - DestTy->getVectorNumElements() == SVI->getType()->getNumElements() && - SVI->getType()->getNumElements() == - SVI->getOperand(0)->getType()->getVectorNumElements()) { + Value *ShufOp0 = Shuf->getOperand(0); + Value *ShufOp1 = Shuf->getOperand(1); + unsigned NumShufElts = Shuf->getType()->getVectorNumElements(); + unsigned NumSrcVecElts = ShufOp0->getType()->getVectorNumElements(); + if (Shuf->hasOneUse() && DestTy->isVectorTy() && + DestTy->getVectorNumElements() == NumShufElts && + NumShufElts == NumSrcVecElts) { BitCastInst *Tmp; // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. - if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && + if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && + ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) && Tmp->getOperand(0)->getType() == DestTy)) { - Value *LHS = Builder.CreateBitCast(SVI->getOperand(0), DestTy); - Value *RHS = Builder.CreateBitCast(SVI->getOperand(1), DestTy); + Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy); + Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy); // Return a new shuffle vector. Use the same element ID's, as we // know the vector types match #elts. - return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2)); + return new ShuffleVectorInst(LHS, RHS, Shuf->getOperand(2)); } } + + // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as + // a byte-swap: + // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X) + // TODO: We should match the related pattern for bitreverse. + if (DestTy->isIntegerTy() && + DL.isLegalInteger(DestTy->getScalarSizeInBits()) && + SrcTy->getScalarSizeInBits() == 8 && NumShufElts % 2 == 0 && + Shuf->hasOneUse() && Shuf->isReverse()) { + assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); + assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op"); + Function *Bswap = + Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy); + Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); + return IntrinsicInst::Create(Bswap, { ScalarX }); + } } // Handle the A->B->A cast, and there is an intervening PHI node. diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 0ece3299754e..f38dc436722d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -69,34 +69,6 @@ static bool hasBranchUse(ICmpInst &I) { return false; } -/// Given an exploded icmp instruction, return true if the comparison only -/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the -/// result of the comparison is true when the input value is signed. 
-static bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, - bool &TrueIfSigned) { - switch (Pred) { - case ICmpInst::ICMP_SLT: // True if LHS s< 0 - TrueIfSigned = true; - return RHS.isNullValue(); - case ICmpInst::ICMP_SLE: // True if LHS s<= RHS and RHS == -1 - TrueIfSigned = true; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_SGT: // True if LHS s> -1 - TrueIfSigned = false; - return RHS.isAllOnesValue(); - case ICmpInst::ICMP_UGT: - // True if LHS u> RHS and RHS == high-bit-mask - 1 - TrueIfSigned = true; - return RHS.isMaxSignedValue(); - case ICmpInst::ICMP_UGE: - // True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc) - TrueIfSigned = true; - return RHS.isSignMask(); - default: - return false; - } -} - /// Returns true if the exploded icmp can be expressed as a signed comparison /// to zero and updates the predicate accordingly. /// The signedness of the comparison is preserved. @@ -900,6 +872,37 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, Offset = EmitGEPOffset(GEPLHS); return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset, Constant::getNullValue(Offset->getType())); + } + + if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) && + isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() && + !NullPointerIsDefined(I.getFunction(), + RHS->getType()->getPointerAddressSpace())) { + // For most address spaces, an allocation can't be placed at null, but null + // itself is treated as a 0 size allocation in the in bounds rules. Thus, + // the only valid inbounds address derived from null, is null itself. + // Thus, we have four cases to consider: + // 1) Base == nullptr, Offset == 0 -> inbounds, null + // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds + // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations) + // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison) + // + // (Note if we're indexing a type of size 0, that simply collapses into one + // of the buckets above.) + // + // In general, we're allowed to make values less poison (i.e. remove + // sources of full UB), so in this case, we just select between the two + // non-poison cases (1 and 4 above). + // + // For vectors, we apply the same reasoning on a per-lane basis. + auto *Base = GEPLHS->getPointerOperand(); + if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) { + int NumElts = GEPLHS->getType()->getVectorNumElements(); + Base = Builder.CreateVectorSplat(NumElts, Base); + } + return new ICmpInst(Cond, Base, + ConstantExpr::getPointerBitCastOrAddrSpaceCast( + cast<Constant>(RHS), Base->getType())); } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) { // If the base pointers are different, but the indices are the same, just // compare the base pointer. @@ -957,12 +960,14 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS, } // If one of the GEPs has all zero indices, recurse. - if (GEPLHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. + if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices()) return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0), ICmpInst::getSwappedPredicate(Cond), I); // If the other GEP has all zero indices, recurse. - if (GEPRHS->hasAllZeroIndices()) + // FIXME: Handle vector of pointers. 
+ if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices()) return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I); bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds(); @@ -1330,6 +1335,59 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, return ExtractValueInst::Create(Call, 1, "sadd.overflow"); } +/// If we have: +/// icmp eq/ne (urem/srem %x, %y), 0 +/// iff %y is a power-of-two, we can replace this with a bit test: +/// icmp eq/ne (and %x, (add %y, -1)), 0 +Instruction *InstCombiner::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) { + // This fold is only valid for equality predicates. + if (!I.isEquality()) + return nullptr; + ICmpInst::Predicate Pred; + Value *X, *Y, *Zero; + if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))), + m_CombineAnd(m_Zero(), m_Value(Zero))))) + return nullptr; + if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I)) + return nullptr; + // This may increase instruction count, we don't enforce that Y is a constant. + Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType())); + Value *Masked = Builder.CreateAnd(X, Mask); + return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero); +} + +/// Fold equality-comparison between zero and any (maybe truncated) right-shift +/// by one-less-than-bitwidth into a sign test on the original value. +Instruction *InstCombiner::foldSignBitTest(ICmpInst &I) { + Instruction *Val; + ICmpInst::Predicate Pred; + if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero()))) + return nullptr; + + Value *X; + Type *XTy; + + Constant *C; + if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) { + XTy = X->getType(); + unsigned XBitWidth = XTy->getScalarSizeInBits(); + if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(XBitWidth, XBitWidth - 1)))) + return nullptr; + } else if (isa<BinaryOperator>(Val) && + (X = reassociateShiftAmtsOfTwoSameDirectionShifts( + cast<BinaryOperator>(Val), SQ.getWithInstruction(Val), + /*AnalyzeForSignBitExtraction=*/true))) { + XTy = X->getType(); + } else + return nullptr; + + return ICmpInst::Create(Instruction::ICmp, + Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE + : ICmpInst::ICMP_SLT, + X, ConstantInt::getNullValue(XTy)); +} + // Handle icmp pred X, 0 Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { CmpInst::Predicate Pred = Cmp.getPredicate(); @@ -1348,6 +1406,9 @@ Instruction *InstCombiner::foldICmpWithZero(ICmpInst &Cmp) { } } + if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp)) + return New; + // Given: // icmp eq/ne (urem %x, %y), 0 // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem': @@ -2192,6 +2253,44 @@ Instruction *InstCombiner::foldICmpShrConstant(ICmpInst &Cmp, return nullptr; } +Instruction *InstCombiner::foldICmpSRemConstant(ICmpInst &Cmp, + BinaryOperator *SRem, + const APInt &C) { + // Match an 'is positive' or 'is negative' comparison of remainder by a + // constant power-of-2 value: + // (X % pow2C) sgt/slt 0 + const ICmpInst::Predicate Pred = Cmp.getPredicate(); + if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT) + return nullptr; + + // TODO: The one-use check is standard because we do not typically want to + // create longer instruction sequences, but this might be a special-case + // because srem is not good for analysis or codegen. 
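// Illustrative aside (not part of the patch): exhaustive i8 check of the
// 'is positive' example given below, (i8 X % 32) s> 0 --> (X & 159) s> 0,
// where 159 is the sign bit (128) or'ed with the low-bit mask (32 - 1).
#include <assert.h>
int main(void) {
  for (int x = -128; x < 128; ++x) {
    int is_positive = (x % 32) > 0;       // (i8 X % 32) s> 0
    int masked = x & 0x9F;                // X & 159: sign bit plus low 5 bits
    int folded = (masked & 0x80) == 0 && masked != 0; // "s> 0" on the i8 result
    assert(is_positive == folded);
  }
  return 0;
}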
+ if (!SRem->hasOneUse()) + return nullptr; + + const APInt *DivisorC; + if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC))) + return nullptr; + + // Mask off the sign bit and the modulo bits (low-bits). + Type *Ty = SRem->getType(); + APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits()); + Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1)); + Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC); + + // For 'is positive?' check that the sign-bit is clear and at least 1 masked + // bit is set. Example: + // (i8 X % 32) s> 0 --> (X & 159) s> 0 + if (Pred == ICmpInst::ICMP_SGT) + return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty)); + + // For 'is negative?' check that the sign-bit is set and at least 1 masked + // bit is set. Example: + // (i16 X % 4) s< 0 --> (X & 32771) u> 32768 + return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask)); +} + /// Fold icmp (udiv X, Y), C. Instruction *InstCombiner::foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, @@ -2400,6 +2499,11 @@ Instruction *InstCombiner::foldICmpSubConstant(ICmpInst &Cmp, const APInt *C2; APInt SubResult; + // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0 + if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality()) + return new ICmpInst(Cmp.getPredicate(), Y, + ConstantInt::get(Y->getType(), 0)); + // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) if (match(X, m_APInt(C2)) && ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) || @@ -2462,9 +2566,6 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, Type *Ty = Add->getType(); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (!Add->hasOneUse()) - return nullptr; - // If the add does not wrap, we can always adjust the compare by subtracting // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE // are canonicalized to SGT/SLT/UGT/ULT. @@ -2498,6 +2599,9 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower)); } + if (!Add->hasOneUse()) + return nullptr; + // X+C <u C2 -> (X & -C2) == C // iff C & (C2-1) == 0 // C2 is a power of 2 @@ -2522,20 +2626,49 @@ bool InstCombiner::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, // TODO: Generalize this to work with other comparison idioms or ensure // they get canonicalized into this form. - // select i1 (a == b), i32 Equal, i32 (select i1 (a < b), i32 Less, i32 - // Greater), where Equal, Less and Greater are placeholders for any three - // constants. - ICmpInst::Predicate PredA, PredB; - if (match(SI->getTrueValue(), m_ConstantInt(Equal)) && - match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) && - PredA == ICmpInst::ICMP_EQ && - match(SI->getFalseValue(), - m_Select(m_ICmp(PredB, m_Specific(LHS), m_Specific(RHS)), - m_ConstantInt(Less), m_ConstantInt(Greater))) && - PredB == ICmpInst::ICMP_SLT) { - return true; + // select i1 (a == b), + // i32 Equal, + // i32 (select i1 (a < b), i32 Less, i32 Greater) + // where Equal, Less and Greater are placeholders for any three constants. + ICmpInst::Predicate PredA; + if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) || + !ICmpInst::isEquality(PredA)) + return false; + Value *EqualVal = SI->getTrueValue(); + Value *UnequalVal = SI->getFalseValue(); + // We still can get non-canonical predicate here, so canonicalize. 
+ if (PredA == ICmpInst::ICMP_NE) + std::swap(EqualVal, UnequalVal); + if (!match(EqualVal, m_ConstantInt(Equal))) + return false; + ICmpInst::Predicate PredB; + Value *LHS2, *RHS2; + if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)), + m_ConstantInt(Less), m_ConstantInt(Greater)))) + return false; + // We can get predicate mismatch here, so canonicalize if possible: + // First, ensure that 'LHS' match. + if (LHS2 != LHS) { + // x sgt y <--> y slt x + std::swap(LHS2, RHS2); + PredB = ICmpInst::getSwappedPredicate(PredB); + } + if (LHS2 != LHS) + return false; + // We also need to canonicalize 'RHS'. + if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) { + // x sgt C-1 <--> x sge C <--> not(x slt C) + auto FlippedStrictness = + getFlippedStrictnessPredicateAndConstant(PredB, cast<Constant>(RHS2)); + if (!FlippedStrictness) + return false; + assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check"); + RHS2 = FlippedStrictness->second; + // And kind-of perform the result swap. + std::swap(Less, Greater); + PredB = ICmpInst::ICMP_SLT; } - return false; + return PredB == ICmpInst::ICMP_SLT && RHS == RHS2; } Instruction *InstCombiner::foldICmpSelectConstant(ICmpInst &Cmp, @@ -2715,6 +2848,10 @@ Instruction *InstCombiner::foldICmpInstWithConstant(ICmpInst &Cmp) { if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C)) return I; break; + case Instruction::SRem: + if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C)) + return I; + break; case Instruction::UDiv: if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C)) return I; @@ -2939,6 +3076,28 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp, } break; } + + case Intrinsic::uadd_sat: { + // uadd.sat(a, b) == 0 -> (a | b) == 0 + if (C.isNullValue()) { + Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1)); + return replaceInstUsesWith(Cmp, Builder.CreateICmp( + Cmp.getPredicate(), Or, Constant::getNullValue(Ty))); + + } + break; + } + + case Intrinsic::usub_sat: { + // usub.sat(a, b) == 0 -> a <= b + if (C.isNullValue()) { + ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ + ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT; + return ICmpInst::Create(Instruction::ICmp, NewPred, + II->getArgOperand(0), II->getArgOperand(1)); + } + break; + } default: break; } @@ -3205,6 +3364,23 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I, llvm_unreachable("All possible folds are handled."); } + // The mask value may be a vector constant that has undefined elements. But it + // may not be safe to propagate those undefs into the new compare, so replace + // those elements by copying an existing, defined, and safe scalar constant. + Type *OpTy = M->getType(); + auto *VecC = dyn_cast<Constant>(M); + if (OpTy->isVectorTy() && VecC && VecC->containsUndefElement()) { + Constant *SafeReplacementConstant = nullptr; + for (unsigned i = 0, e = OpTy->getVectorNumElements(); i != e; ++i) { + if (!isa<UndefValue>(VecC->getAggregateElement(i))) { + SafeReplacementConstant = VecC->getAggregateElement(i); + break; + } + } + assert(SafeReplacementConstant && "Failed to find undef replacement"); + M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant); + } + return Builder.CreateICmp(DstPred, X, M); } @@ -3288,6 +3464,7 @@ foldICmpWithTruncSignExtendedVal(ICmpInst &I, // we should move shifts to the same hand of 'and', i.e. rewrite as // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x) // We are only interested in opposite logical shifts here. 
+// One of the shifts can be truncated. // If we can, we want to end up creating 'lshr' shift. static Value * foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, @@ -3297,18 +3474,32 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, return nullptr; auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value()); - auto m_AnyLShr = m_LShr(m_Value(), m_Value()); - - // Look for an 'and' of two (opposite) logical shifts. - // Pick the single-use shift as XShift. - Instruction *XShift, *YShift; - if (!match(I.getOperand(0), - m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), - m_CombineAnd(m_AnyLogicalShift, m_Instruction(YShift))))) + + // Look for an 'and' of two logical shifts, one of which may be truncated. + // We use m_TruncOrSelf() on the RHS to correctly handle commutative case. + Instruction *XShift, *MaybeTruncation, *YShift; + if (!match( + I.getOperand(0), + m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)), + m_CombineAnd(m_TruncOrSelf(m_CombineAnd( + m_AnyLogicalShift, m_Instruction(YShift))), + m_Instruction(MaybeTruncation))))) return nullptr; + // We potentially looked past 'trunc', but only when matching YShift, + // therefore YShift must have the widest type. + Instruction *WidestShift = YShift; + // Therefore XShift must have the shallowest type. + // Or they both have identical types if there was no truncation. + Instruction *NarrowestShift = XShift; + + Type *WidestTy = WidestShift->getType(); + assert(NarrowestShift->getType() == I.getOperand(0)->getType() && + "We did not look past any shifts while matching XShift though."); + bool HadTrunc = WidestTy != I.getOperand(0)->getType(); + // If YShift is a 'lshr', swap the shifts around. - if (match(YShift, m_AnyLShr)) + if (match(YShift, m_LShr(m_Value(), m_Value()))) std::swap(XShift, YShift); // The shifts must be in opposite directions. @@ -3317,44 +3508,181 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ, return nullptr; // Do not care about same-direction shifts here. Value *X, *XShAmt, *Y, *YShAmt; - match(XShift, m_BinOp(m_Value(X), m_Value(XShAmt))); - match(YShift, m_BinOp(m_Value(Y), m_Value(YShAmt))); + match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt)))); + match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt)))); // If one of the values being shifted is a constant, then we will end with - // and+icmp, and shift instr will be constant-folded. If they are not, + // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not, // however, we will need to ensure that we won't increase instruction count. if (!isa<Constant>(X) && !isa<Constant>(Y)) { // At least one of the hands of the 'and' should be one-use shift. if (!match(I.getOperand(0), m_c_And(m_OneUse(m_AnyLogicalShift), m_Value()))) return nullptr; + if (HadTrunc) { + // Due to the 'trunc', we will need to widen X. For that either the old + // 'trunc' or the shift amt in the non-truncated shift should be one-use. + if (!MaybeTruncation->hasOneUse() && + !NarrowestShift->getOperand(1)->hasOneUse()) + return nullptr; + } } + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case let's bailout now. + if (XShAmt->getType() != YShAmt->getType()) + return nullptr; + // Can we fold (XShAmt+YShAmt) ? 
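// Illustrative aside (not part of the patch): a brute-force i8 check of one
// equivalent form of the rewrite described above, using opposite shifts by 2
// and 3 (so Q+K = 5 u< 8 as required).
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned x = 0; x < 256; ++x) {
    for (unsigned y = 0; y < 256; ++y) {
      int original = ((uint8_t)(x << 2) & (uint8_t)(y >> 3)) != 0; // (x shl 2) & (y lshr 3)
      int rewritten = (((uint8_t)(y >> 5)) & (uint8_t)x) != 0;     // (y lshr 5) & x
      assert(original == rewritten);
    }
  }
  return 0;
}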
- Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, XShAmt, YShAmt, - SQ.getWithInstruction(&I)); + auto *NewShAmt = dyn_cast_or_null<Constant>( + SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false, + /*isNUW=*/false, SQ.getWithInstruction(&I))); if (!NewShAmt) return nullptr; + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy); + unsigned WidestBitWidth = WidestTy->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width? // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); - if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, + APInt(WidestBitWidth, WidestBitWidth)))) return nullptr; - // All good, we can do this fold. The shift is the same that was for X. + + // An extra legality check is needed if we had trunc-of-lshr. + if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) { + auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ, + WidestShift]() { + // It isn't obvious whether it's worth it to analyze non-constants here. + // Also, let's basically give up on non-splat cases, pessimizing vectors. + // If *any* of these preconditions matches we can perform the fold. + Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy() + ? NewShAmt->getSplatValue() + : NewShAmt; + // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold. + if (NewShAmtSplat && + (NewShAmtSplat->isNullValue() || + NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1)) + return true; + // We consider *min* leading zeros so a single outlier + // blocks the transform as opposed to allowing it. + if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: NewShAmt u<= countLeadingZeros(C) + if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero)) + return true; + } + if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) { + KnownBits Known = computeKnownBits(C, SQ.DL); + unsigned MinLeadZero = Known.countMinLeadingZeros(); + // If the value being shifted has at most lowest bit set we can fold. + unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero; + if (MaxActiveBits <= 1) + return true; + // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C) + if (NewShAmtSplat) { + APInt AdjNewShAmt = + (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger(); + if (AdjNewShAmt.ule(MinLeadZero)) + return true; + } + } + return false; // Can't tell if it's ok. + }; + if (!CanFold()) + return nullptr; + } + + // All good, we can do this fold. + X = Builder.CreateZExt(X, WidestTy); + Y = Builder.CreateZExt(Y, WidestTy); + // The shift is the same that was for X. Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr ? 
Builder.CreateLShr(X, NewShAmt) : Builder.CreateShl(X, NewShAmt); Value *T1 = Builder.CreateAnd(T0, Y); return Builder.CreateICmp(I.getPredicate(), T1, - Constant::getNullValue(X->getType())); + Constant::getNullValue(WidestTy)); +} + +/// Fold +/// (-1 u/ x) u< y +/// ((x * y) u/ x) != y +/// to +/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit +/// Note that the comparison is commutative, while inverted (u>=, ==) predicate +/// will mean that we are looking for the opposite answer. +Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) { + ICmpInst::Predicate Pred; + Value *X, *Y; + Instruction *Mul; + bool NeedNegation; + // Look for: (-1 u/ x) u</u>= y + if (!I.isEquality() && + match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))), + m_Value(Y)))) { + Mul = nullptr; + // Canonicalize as-if y was on RHS. + if (I.getOperand(1) != Y) + Pred = I.getSwappedPredicate(); + + // Are we checking that overflow does not happen, or does happen? + switch (Pred) { + case ICmpInst::Predicate::ICMP_ULT: + NeedNegation = false; + break; // OK + case ICmpInst::Predicate::ICMP_UGE: + NeedNegation = true; + break; // OK + default: + return nullptr; // Wrong predicate. + } + } else // Look for: ((x * y) u/ x) !=/== y + if (I.isEquality() && + match(&I, m_c_ICmp(Pred, m_Value(Y), + m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y), + m_Value(X)), + m_Instruction(Mul)), + m_Deferred(X)))))) { + NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ; + } else + return nullptr; + + BuilderTy::InsertPointGuard Guard(Builder); + // If the pattern included (x * y), we'll want to insert new instructions + // right before that original multiplication so that we can replace it. + bool MulHadOtherUses = Mul && !Mul->hasOneUse(); + if (MulHadOtherUses) + Builder.SetInsertPoint(Mul); + + Function *F = Intrinsic::getDeclaration( + I.getModule(), Intrinsic::umul_with_overflow, X->getType()); + CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul"); + + // If the multiplication was used elsewhere, to ensure that we don't leave + // "duplicate" instructions, replace uses of that original multiplication + // with the multiplication result from the with.overflow intrinsic. + if (MulHadOtherUses) + replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val")); + + Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov"); + if (NeedNegation) // This technically increases instruction count. + Res = Builder.CreateNot(Res, "umul.not.ov"); + + return Res; } /// Try to fold icmp (binop), X or icmp X, (binop). /// TODO: A large part of this logic is duplicated in InstSimplify's /// simplifyICmpWithBinOp(). We should be able to share that and avoid the code /// duplication. -Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { +Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I, const SimplifyQuery &SQ) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); // Special logic for binary operators. @@ -3367,13 +3695,13 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { Value *X; // Convert add-with-unsigned-overflow comparisons into a 'not' with compare. 
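// Illustrative aside (not part of the patch): exhaustive i8 check of the
// add-with-unsigned-overflow rewrite noted above, (a + b) u< a  <=>  ~a u< b.
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      int wrapped = (uint8_t)(a + b) < a; // (Op1 + X) u< Op1
      int rewritten = (uint8_t)~a < b;    // ~Op1 u< X
      assert(wrapped == rewritten);
    }
  }
  return 0;
}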
- // (Op1 + X) <u Op1 --> ~Op1 <u X - // Op0 >u (Op0 + X) --> X >u ~Op0 + // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) && - Pred == ICmpInst::ICMP_ULT) + (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) return new ICmpInst(Pred, Builder.CreateNot(Op1), X); + // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0 if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) && - Pred == ICmpInst::ICMP_UGT) + (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) return new ICmpInst(Pred, X, Builder.CreateNot(Op0)); bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; @@ -3400,21 +3728,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow. + // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow. + // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow. if ((A == Op1 || B == Op1) && NoOp0WrapProblem) return new ICmpInst(Pred, A == Op1 ? B : A, Constant::getNullValue(Op1->getType())); - // icmp X, (X+Y) -> icmp 0, Y for equalities or if there is no overflow. + // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow. + // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow. if ((C == Op0 || D == Op0) && NoOp1WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()), C == Op0 ? D : C); - // icmp (X+Y), (X+Z) -> icmp Y, Z for equalities or if there is no overflow. + // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow. if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem && - NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) { + NoOp1WrapProblem) { // Determine Y and Z in the form icmp (X+Y), (X+Z). 
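// Illustrative aside (not part of the patch): the equality half of the fold
// above holds even when the additions wrap, since adding A is invertible
// modulo 2^N. Exhaustive i8 check that (A+B) == (A+D)  <=>  B == D.
#include <assert.h>
#include <stdint.h>
int main(void) {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b)
      for (unsigned d = 0; d < 256; ++d)
        assert(((uint8_t)(a + b) == (uint8_t)(a + d)) == (b == d));
  return 0;
}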
Value *Y, *Z; if (A == C) { @@ -3438,39 +3766,39 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { return new ICmpInst(Pred, Y, Z); } - // icmp slt (X + -1), Y -> icmp sle X, Y + // icmp slt (A + -1), Op1 -> icmp sle A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLE, A, Op1); - // icmp sge (X + -1), Y -> icmp sgt X, Y + // icmp sge (A + -1), Op1 -> icmp sgt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE && match(B, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGT, A, Op1); - // icmp sle (X + 1), Y -> icmp slt X, Y + // icmp sle (A + 1), Op1 -> icmp slt A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SLT, A, Op1); - // icmp sgt (X + 1), Y -> icmp sge X, Y + // icmp sgt (A + 1), Op1 -> icmp sge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); - // icmp sgt X, (Y + -1) -> icmp sge X, Y + // icmp sgt Op0, (C + -1) -> icmp sge Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SGE, Op0, C); - // icmp sle X, (Y + -1) -> icmp slt X, Y + // icmp sle Op0, (C + -1) -> icmp slt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE && match(D, m_AllOnes())) return new ICmpInst(CmpInst::ICMP_SLT, Op0, C); - // icmp sge X, (Y + 1) -> icmp sgt X, Y + // icmp sge Op0, (C + 1) -> icmp sgt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SGT, Op0, C); - // icmp slt X, (Y + 1) -> icmp sle X, Y + // icmp slt Op0, (C + 1) -> icmp sle Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_SLE, Op0, C); @@ -3478,33 +3806,33 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { // canonicalization from (X -nuw 1) to (X + -1) means that the combinations // wouldn't happen even if they were implemented. // - // icmp ult (X - 1), Y -> icmp ule X, Y - // icmp uge (X - 1), Y -> icmp ugt X, Y - // icmp ugt X, (Y - 1) -> icmp uge X, Y - // icmp ule X, (Y - 1) -> icmp ult X, Y + // icmp ult (A - 1), Op1 -> icmp ule A, Op1 + // icmp uge (A - 1), Op1 -> icmp ugt A, Op1 + // icmp ugt Op0, (C - 1) -> icmp uge Op0, C + // icmp ule Op0, (C - 1) -> icmp ult Op0, C - // icmp ule (X + 1), Y -> icmp ult X, Y + // icmp ule (A + 1), Op0 -> icmp ult A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_ULT, A, Op1); - // icmp ugt (X + 1), Y -> icmp uge X, Y + // icmp ugt (A + 1), Op0 -> icmp uge A, Op1 if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One())) return new ICmpInst(CmpInst::ICMP_UGE, A, Op1); - // icmp uge X, (Y + 1) -> icmp ugt X, Y + // icmp uge Op0, (C + 1) -> icmp ugt Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_UGT, Op0, C); - // icmp ult X, (Y + 1) -> icmp ule X, Y + // icmp ult Op0, (C + 1) -> icmp ule Op0, C if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One())) return new ICmpInst(CmpInst::ICMP_ULE, Op0, C); // if C1 has greater magnitude than C2: - // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y + // icmp (A + C1), (C + C2) -> icmp (A + C3), C // s.t. 
C3 = C1 - C2 // // if C2 has greater magnitude than C1: - // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3) + // icmp (A + C1), (C + C2) -> icmp A, (C + C3) // s.t. C3 = C2 - C1 if (A && C && NoOp0WrapProblem && NoOp1WrapProblem && (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned()) @@ -3542,29 +3870,35 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { D = BO1->getOperand(1); } - // icmp (X-Y), X -> icmp 0, Y for equalities or if there is no overflow. + // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow. if (A == Op1 && NoOp0WrapProblem) return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B); - // icmp X, (X-Y) -> icmp Y, 0 for equalities or if there is no overflow. + // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow. if (C == Op0 && NoOp1WrapProblem) return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType())); - // (A - B) >u A --> A <u B - if (A == Op1 && Pred == ICmpInst::ICMP_UGT) - return new ICmpInst(ICmpInst::ICMP_ULT, A, B); - // C <u (C - D) --> C <u D - if (C == Op0 && Pred == ICmpInst::ICMP_ULT) - return new ICmpInst(ICmpInst::ICMP_ULT, C, D); - - // icmp (Y-X), (Z-X) -> icmp Y, Z for equalities or if there is no overflow. - if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + // Convert sub-with-unsigned-overflow comparisons into a comparison of args. + // (A - B) u>/u<= A --> B u>/u<= A + if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE)) + return new ICmpInst(Pred, B, A); + // C u</u>= (C - D) --> C u</u>= D + if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) + return new ICmpInst(Pred, C, D); + // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 + if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && + isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); + // C u<=/u> (C - D) --> C u</u>= D iff B != 0 + if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) && + isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT)) + return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); + + // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. + if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, A, C); - // icmp (X-Y), (X-Z) -> icmp Z, Y for equalities or if there is no overflow. - if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem && - // Try not to increase register pressure. - BO0->hasOneUse() && BO1->hasOneUse()) + + // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow. + if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem) return new ICmpInst(Pred, D, B); // icmp (0-X) < cst --> x > -cst @@ -3699,6 +4033,9 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } + if (Value *V = foldUnsignedMultiplicationOverflowCheck(I)) + return replaceInstUsesWith(I, V); + if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder)) return replaceInstUsesWith(I, V); @@ -3975,125 +4312,140 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) { return nullptr; } -/// Handle icmp (cast x to y), (cast/cst). We only handle extending casts so -/// far. 
-Instruction *InstCombiner::foldICmpWithCastAndCast(ICmpInst &ICmp) { - const CastInst *LHSCI = cast<CastInst>(ICmp.getOperand(0)); - Value *LHSCIOp = LHSCI->getOperand(0); - Type *SrcTy = LHSCIOp->getType(); - Type *DestTy = LHSCI->getType(); - - // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the - // integer type is the same size as the pointer type. - const auto& CompatibleSizes = [&](Type* SrcTy, Type* DestTy) -> bool { - if (isa<VectorType>(SrcTy)) { - SrcTy = cast<VectorType>(SrcTy)->getElementType(); - DestTy = cast<VectorType>(DestTy)->getElementType(); - } - return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); - }; - if (LHSCI->getOpcode() == Instruction::PtrToInt && - CompatibleSizes(SrcTy, DestTy)) { - Value *RHSOp = nullptr; - if (auto *RHSC = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) { - Value *RHSCIOp = RHSC->getOperand(0); - if (RHSCIOp->getType()->getPointerAddressSpace() == - LHSCIOp->getType()->getPointerAddressSpace()) { - RHSOp = RHSC->getOperand(0); - // If the pointer types don't match, insert a bitcast. - if (LHSCIOp->getType() != RHSOp->getType()) - RHSOp = Builder.CreateBitCast(RHSOp, LHSCIOp->getType()); - } - } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) { - RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy); - } - - if (RHSOp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSOp); - } - - // The code below only handles extension cast instructions, so far. - // Enforce this. - if (LHSCI->getOpcode() != Instruction::ZExt && - LHSCI->getOpcode() != Instruction::SExt) +static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp, + InstCombiner::BuilderTy &Builder) { + assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0"); + auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0)); + Value *X; + if (!match(CastOp0, m_ZExtOrSExt(m_Value(X)))) return nullptr; - bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt; - bool isSignedCmp = ICmp.isSigned(); - - if (auto *CI = dyn_cast<CastInst>(ICmp.getOperand(1))) { - // Not an extension from the same type? - Value *RHSCIOp = CI->getOperand(0); - if (RHSCIOp->getType() != LHSCIOp->getType()) - return nullptr; - + bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt; + bool IsSignedCmp = ICmp.isSigned(); + if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) { // If the signedness of the two casts doesn't agree (i.e. one is a sext // and the other is a zext), then we can't handle this. - if (CI->getOpcode() != LHSCI->getOpcode()) + // TODO: This is too strict. We can handle some predicates (equality?). + if (CastOp0->getOpcode() != CastOp1->getOpcode()) return nullptr; - // Deal with equality cases early. + // Not an extension from the same type? + Value *Y = CastOp1->getOperand(0); + Type *XTy = X->getType(), *YTy = Y->getType(); + if (XTy != YTy) { + // One of the casts must have one use because we are creating a new cast. + if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse()) + return nullptr; + // Extend the narrower operand to the type of the wider operand. 
+ if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits()) + X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy); + else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits()) + Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy); + else + return nullptr; + } + + // (zext X) == (zext Y) --> X == Y + // (sext X) == (sext Y) --> X == Y if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getPredicate(), X, Y); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedCmp && isSignedExt) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, RHSCIOp); + if (IsSignedCmp && IsSignedExt) + return new ICmpInst(ICmp.getPredicate(), X, Y); // The other three cases all fold into an unsigned comparison. - return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, RHSCIOp); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y); } - // If we aren't dealing with a constant on the RHS, exit early. + // Below here, we are only folding a compare with constant. auto *C = dyn_cast<Constant>(ICmp.getOperand(1)); if (!C) return nullptr; // Compute the constant that would happen if we truncated to SrcTy then // re-extended to DestTy. + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy); - Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy); + Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy); // If the re-extended constant didn't change... if (Res2 == C) { - // Deal with equality cases early. if (ICmp.isEquality()) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getPredicate(), X, Res1); // A signed comparison of sign extended values simplifies into a // signed comparison. - if (isSignedExt && isSignedCmp) - return new ICmpInst(ICmp.getPredicate(), LHSCIOp, Res1); + if (IsSignedExt && IsSignedCmp) + return new ICmpInst(ICmp.getPredicate(), X, Res1); // The other three cases all fold into an unsigned comparison. - return new ICmpInst(ICmp.getUnsignedPredicate(), LHSCIOp, Res1); + return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1); } // The re-extended constant changed, partly changed (in the case of a vector), // or could not be determined to be equal (in the case of a constant // expression), so the constant cannot be represented in the shorter type. - // Consequently, we cannot emit a simple comparison. // All the cases that fold to true or false will have already been handled // by SimplifyICmpInst, so only deal with the tricky case. + if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C)) + return nullptr; - if (isSignedCmp || !isSignedExt || !isa<ConstantInt>(C)) + // Is source op positive? + // icmp ult (sext X), C --> icmp sgt X, -1 + if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) + return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy)); + + // Is source op negative? + // icmp ugt (sext X), C --> icmp slt X, 0 + assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); + return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy)); +} + +/// Handle icmp (cast x), (cast or constant). 
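A minimal C++ sketch (hypothetical helper names, illustrative only) of the extension-compare folds handled above: same-kind extensions compare like the narrow values, and an unsigned compare of a sign-extended value against a constant that does not fit the narrow type degenerates into a sign test:

#include <cstdint>

bool eqWidened(uint16_t x, uint16_t y) {
  return (uint32_t)x == (uint32_t)y;   // expected to fold to: x == y
}

bool isNegative(int16_t x) {
  return (uint32_t)x > 100000u;        // expected to fold to: x < 0
}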
+Instruction *InstCombiner::foldICmpWithCastOp(ICmpInst &ICmp) { + auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0)); + if (!CastOp0) + return nullptr; + if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1))) return nullptr; - // Evaluate the comparison for LT (we invert for GT below). LE and GE cases - // should have been folded away previously and not enter in here. + Value *Op0Src = CastOp0->getOperand(0); + Type *SrcTy = CastOp0->getSrcTy(); + Type *DestTy = CastOp0->getDestTy(); - // We're performing an unsigned comp with a sign extended value. - // This is true if the input is >= 0. [aka >s -1] - Constant *NegOne = Constant::getAllOnesValue(SrcTy); - Value *Result = Builder.CreateICmpSGT(LHSCIOp, NegOne, ICmp.getName()); + // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the + // integer type is the same size as the pointer type. + auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) { + if (isa<VectorType>(SrcTy)) { + SrcTy = cast<VectorType>(SrcTy)->getElementType(); + DestTy = cast<VectorType>(DestTy)->getElementType(); + } + return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth(); + }; + if (CastOp0->getOpcode() == Instruction::PtrToInt && + CompatibleSizes(SrcTy, DestTy)) { + Value *NewOp1 = nullptr; + if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) { + Value *PtrSrc = PtrToIntOp1->getOperand(0); + if (PtrSrc->getType()->getPointerAddressSpace() == + Op0Src->getType()->getPointerAddressSpace()) { + NewOp1 = PtrToIntOp1->getOperand(0); + // If the pointer types don't match, insert a bitcast. + if (Op0Src->getType() != NewOp1->getType()) + NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType()); + } + } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) { + NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy); + } - // Finally, return the value computed. - if (ICmp.getPredicate() == ICmpInst::ICMP_ULT) - return replaceInstUsesWith(ICmp, Result); + if (NewOp1) + return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1); + } - assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!"); - return BinaryOperator::CreateNot(Result); + return foldICmpWithZextOrSext(ICmp, Builder); } static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) { @@ -4595,7 +4947,7 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { // Get scalar or pointer size. unsigned BitWidth = Ty->isIntOrIntVectorTy() ? Ty->getScalarSizeInBits() - : DL.getIndexTypeSizeInBits(Ty->getScalarType()); + : DL.getPointerTypeSizeInBits(Ty->getScalarType()); if (!BitWidth) return nullptr; @@ -4813,41 +5165,36 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) { return nullptr; } -/// If we have an icmp le or icmp ge instruction with a constant operand, turn -/// it into the appropriate icmp lt or icmp gt instruction. This transform -/// allows them to be folded in visitICmpInst. 
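A short, hypothetical C++ example of the ptrtoint case kept by the rewritten foldICmpWithCastOp above, assuming uintptr_t has the same width as a pointer on the target:

#include <cstdint>

bool samePlace(const int *p, const int *q) {
  return (uintptr_t)p == (uintptr_t)q;   // expected to fold to: p == q
}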
-static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { - ICmpInst::Predicate Pred = I.getPredicate(); - if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGE && - Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_UGE) - return nullptr; +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, + Constant *C) { + assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) && + "Only for relational integer predicates."); - Value *Op0 = I.getOperand(0); - Value *Op1 = I.getOperand(1); - auto *Op1C = dyn_cast<Constant>(Op1); - if (!Op1C) - return nullptr; + Type *Type = C->getType(); + bool IsSigned = ICmpInst::isSigned(Pred); + + CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred); + bool WillIncrement = + UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT; - // Check if the constant operand can be safely incremented/decremented without - // overflowing/underflowing. For scalars, SimplifyICmpInst has already handled - // the edge cases for us, so we just assert on them. For vectors, we must - // handle the edge cases. - Type *Op1Type = Op1->getType(); - bool IsSigned = I.isSigned(); - bool IsLE = (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_ULE); - auto *CI = dyn_cast<ConstantInt>(Op1C); - if (CI) { - // A <= MAX -> TRUE ; A >= MIN -> TRUE - assert(IsLE ? !CI->isMaxValue(IsSigned) : !CI->isMinValue(IsSigned)); - } else if (Op1Type->isVectorTy()) { - // TODO? If the edge cases for vectors were guaranteed to be handled as they - // are for scalar, we could remove the min/max checks. However, to do that, - // we would have to use insertelement/shufflevector to replace edge values. - unsigned NumElts = Op1Type->getVectorNumElements(); + // Check if the constant operand can be safely incremented/decremented + // without overflowing/underflowing. + auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) { + return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned); + }; + + Constant *SafeReplacementConstant = nullptr; + if (auto *CI = dyn_cast<ConstantInt>(C)) { + // Bail out if the constant can't be safely incremented/decremented. + if (!ConstantIsOk(CI)) + return llvm::None; + } else if (Type->isVectorTy()) { + unsigned NumElts = Type->getVectorNumElements(); for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = Op1C->getAggregateElement(i); + Constant *Elt = C->getAggregateElement(i); if (!Elt) - return nullptr; + return llvm::None; if (isa<UndefValue>(Elt)) continue; @@ -4855,20 +5202,54 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { // Bail out if we can't determine if this constant is min/max or if we // know that this constant is min/max. auto *CI = dyn_cast<ConstantInt>(Elt); - if (!CI || (IsLE ? CI->isMaxValue(IsSigned) : CI->isMinValue(IsSigned))) - return nullptr; + if (!CI || !ConstantIsOk(CI)) + return llvm::None; + + if (!SafeReplacementConstant) + SafeReplacementConstant = CI; } } else { // ConstantExpr? - return nullptr; + return llvm::None; } - // Increment or decrement the constant and set the new comparison predicate: - // ULE -> ULT ; UGE -> UGT ; SLE -> SLT ; SGE -> SGT - Constant *OneOrNegOne = ConstantInt::get(Op1Type, IsLE ? 1 : -1, true); - CmpInst::Predicate NewPred = IsLE ? ICmpInst::ICMP_ULT: ICmpInst::ICMP_UGT; - NewPred = IsSigned ? 
ICmpInst::getSignedPredicate(NewPred) : NewPred; - return new ICmpInst(NewPred, Op0, ConstantExpr::getAdd(Op1C, OneOrNegOne)); + // It may not be safe to change a compare predicate in the presence of + // undefined elements, so replace those elements with the first safe constant + // that we found. + if (C->containsUndefElement()) { + assert(SafeReplacementConstant && "Replacement constant not set"); + C = Constant::replaceUndefsWith(C, SafeReplacementConstant); + } + + CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred); + + // Increment or decrement the constant. + Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true); + Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne); + + return std::make_pair(NewPred, NewC); +} + +/// If we have an icmp le or icmp ge instruction with a constant operand, turn +/// it into the appropriate icmp lt or icmp gt instruction. This transform +/// allows them to be folded in visitICmpInst. +static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) { + ICmpInst::Predicate Pred = I.getPredicate(); + if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) || + isCanonicalPredicate(Pred)) + return nullptr; + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + auto *Op1C = dyn_cast<Constant>(Op1); + if (!Op1C) + return nullptr; + + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, Op1C); + if (!FlippedStrictness) + return nullptr; + + return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second); } /// Integer compare with boolean values can always be turned into bitwise ops. @@ -5022,8 +5403,39 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, return nullptr; } +// extract(uadd.with.overflow(A, B), 0) ult A +// -> extract(uadd.with.overflow(A, B), 1) +static Instruction *foldICmpOfUAddOv(ICmpInst &I) { + CmpInst::Predicate Pred = I.getPredicate(); + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + + Value *UAddOv; + Value *A, *B; + auto UAddOvResultPat = m_ExtractValue<0>( + m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B))); + if (match(Op0, UAddOvResultPat) && + ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) || + (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) && + (match(A, m_One()) || match(B, m_One()))) || + (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) && + (match(A, m_AllOnes()) || match(B, m_AllOnes()))))) + // extract(uadd.with.overflow(A, B), 0) < A + // extract(uadd.with.overflow(A, 1), 0) == 0 + // extract(uadd.with.overflow(A, -1), 0) != -1 + UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand(); + else if (match(Op1, UAddOvResultPat) && + Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B)) + // A > extract(uadd.with.overflow(A, B), 0) + UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand(); + else + return nullptr; + + return ExtractValueInst::Create(UAddOv, 1); +} + Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { bool Changed = false; + const SimplifyQuery Q = SQ.getWithInstruction(&I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); unsigned Op0Cplxity = getComplexity(Op0); unsigned Op1Cplxity = getComplexity(Op1); @@ -5038,8 +5450,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { Changed = true; } - if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, - SQ.getWithInstruction(&I))) + if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q)) return replaceInstUsesWith(I, V); // Comparing -val or val with non-zero is the same as just comparing 
val @@ -5072,6 +5483,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpWithDominatingICmp(I)) return Res; + if (Instruction *Res = foldICmpBinOp(I, Q)) + return Res; + if (Instruction *Res = foldICmpUsingKnownBits(I)) return Res; @@ -5120,6 +5534,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpInstWithConstant(I)) return Res; + // Try to match comparison as a sign bit test. Intentionally do this after + // foldICmpInstWithConstant() to potentially let other folds to happen first. + if (Instruction *New = foldSignBitTest(I)) + return New; + if (Instruction *Res = foldICmpInstWithConstantNotInt(I)) return Res; @@ -5146,20 +5565,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpBitCast(I, Builder)) return Res; - if (isa<CastInst>(Op0)) { - // Handle the special case of: icmp (cast bool to X), <cst> - // This comes up when you have code like - // int X = A < B; - // if (X) ... - // For generality, we handle any zero-extension of any operand comparison - // with a constant or another cast from the same type. - if (isa<Constant>(Op1) || isa<CastInst>(Op1)) - if (Instruction *R = foldICmpWithCastAndCast(I)) - return R; - } - - if (Instruction *Res = foldICmpBinOp(I)) - return Res; + if (Instruction *R = foldICmpWithCastOp(I)) + return R; if (Instruction *Res = foldICmpWithMinMax(I)) return Res; @@ -5214,6 +5621,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldICmpEquality(I)) return Res; + if (Instruction *Res = foldICmpOfUAddOv(I)) + return Res; + // The 'cmpxchg' instruction returns an aggregate containing the old value and // an i1 which indicates whether or not we successfully did the swap. // diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 434b0d591215..1a746cb87abb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -113,6 +113,48 @@ static inline bool isCanonicalPredicate(CmpInst::Predicate Pred) { } } +/// Given an exploded icmp instruction, return true if the comparison only +/// checks the sign bit. If it only checks the sign bit, set TrueIfSigned if the +/// result of the comparison is true when the input value is signed. 
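A sketch of the shape foldICmpOfUAddOv targets, assuming a Clang/GCC-style __builtin_add_overflow that lowers to llvm.uadd.with.overflow; the function name is illustrative only:

#include <cstdint>

bool addOverflows(uint32_t a, uint32_t b) {
  uint32_t sum;
  (void)__builtin_add_overflow(a, b, &sum);  // produces the {sum, overflow} pair
  return sum < a;                            // folds to the overflow flag itself
}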
+inline bool isSignBitCheck(ICmpInst::Predicate Pred, const APInt &RHS, + bool &TrueIfSigned) { + switch (Pred) { + case ICmpInst::ICMP_SLT: // True if LHS s< 0 + TrueIfSigned = true; + return RHS.isNullValue(); + case ICmpInst::ICMP_SLE: // True if LHS s<= -1 + TrueIfSigned = true; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGT: // True if LHS s> -1 + TrueIfSigned = false; + return RHS.isAllOnesValue(); + case ICmpInst::ICMP_SGE: // True if LHS s>= 0 + TrueIfSigned = false; + return RHS.isNullValue(); + case ICmpInst::ICMP_UGT: + // True if LHS u> RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = true; + return RHS.isMaxSignedValue(); + case ICmpInst::ICMP_UGE: + // True if LHS u>= RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = true; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULT: + // True if LHS u< RHS and RHS == sign-bit-mask (2^7, 2^15, 2^31, etc) + TrueIfSigned = false; + return RHS.isMinSignedValue(); + case ICmpInst::ICMP_ULE: + // True if LHS u<= RHS and RHS == sign-bit-mask - 1 + TrueIfSigned = false; + return RHS.isMaxSignedValue(); + default: + return false; + } +} + +llvm::Optional<std::pair<CmpInst::Predicate, Constant *>> +getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred, Constant *C); + /// Return the source operand of a potentially bitcasted value while optionally /// checking if it has one use. If there is no bitcast or the one use check is /// not met, return the input value itself. @@ -139,31 +181,16 @@ static inline Constant *SubOne(Constant *C) { /// This happens in cases where the ~ can be eliminated. If WillInvertAllUses /// is true, work under the assumption that the caller intends to remove all /// uses of V and only keep uses of ~V. -static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { +/// +/// See also: canFreelyInvertAllUsersOf() +static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) { // ~(~(X)) -> X. if (match(V, m_Not(m_Value()))) return true; // Constants can be considered to be not'ed values. - if (isa<ConstantInt>(V)) - return true; - - // A vector of constant integers can be inverted easily. - if (V->getType()->isVectorTy() && isa<Constant>(V)) { - unsigned NumElts = V->getType()->getVectorNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = cast<Constant>(V)->getAggregateElement(i); - if (!Elt) - return false; - - if (isa<UndefValue>(Elt)) - continue; - - if (!isa<ConstantInt>(Elt)) - return false; - } + if (match(V, m_AnyIntegralConstant())) return true; - } // Compares can be inverted if all of their uses are being modified to use the // ~V. @@ -185,6 +212,32 @@ static inline bool IsFreeToInvert(Value *V, bool WillInvertAllUses) { return false; } +/// Given i1 V, can every user of V be freely adapted if V is changed to !V ? +/// +/// See also: isFreeToInvert() +static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) { + // Look at every user of V. + for (User *U : V->users()) { + if (U == IgnoredUser) + continue; // Don't consider this user. + + auto *I = cast<Instruction>(U); + switch (I->getOpcode()) { + case Instruction::Select: + case Instruction::Br: + break; // Free to invert by swapping true/false values/destinations. + case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it. + if (!match(I, m_Not(m_Value()))) + return false; // Not a 'not'. + break; + default: + return false; // Don't know, likely not freely invertible. + } + // So far all users were free to invert... 
+ } + return true; // Can freely invert all users! +} + /// Some binary operators require special handling to avoid poison and undefined /// behavior. If a constant vector has undef elements, replace those undefs with /// identity constants if possible because those are always safe to execute. @@ -316,7 +369,8 @@ public: Instruction *visitFNeg(UnaryOperator &I); Instruction *visitAdd(BinaryOperator &I); Instruction *visitFAdd(BinaryOperator &I); - Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty); + Value *OptimizePointerDifference( + Value *LHS, Value *RHS, Type *Ty, bool isNUW); Instruction *visitSub(BinaryOperator &I); Instruction *visitFSub(BinaryOperator &I); Instruction *visitMul(BinaryOperator &I); @@ -337,6 +391,13 @@ public: Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Value *reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction = false); + Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract( + BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); @@ -386,6 +447,7 @@ public: Instruction *visitLandingPadInst(LandingPadInst &LI); Instruction *visitVAStartInst(VAStartInst &I); Instruction *visitVACopyInst(VACopyInst &I); + Instruction *visitFreeze(FreezeInst &I); /// Specify what to return for unhandled instructions. Instruction *visitInstruction(Instruction &I) { return nullptr; } @@ -405,6 +467,9 @@ public: /// \return true if successful. bool replacePointer(Instruction &I, Value *V); + LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix = ""); + private: bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const; bool shouldChangeType(Type *From, Type *To) const; @@ -541,6 +606,7 @@ private: Instruction *narrowMathIfNoOverflow(BinaryOperator &I); Instruction *narrowRotate(TruncInst &Trunc); Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN); + Instruction *matchSAddSubSat(SelectInst &MinMax1); /// Determine if a pair of casts can be replaced by a single cast. /// @@ -557,7 +623,7 @@ private: Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI); - Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS); + Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &I); /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp). /// NOTE: Unlike most of instcombine, this returns a Value which should @@ -644,7 +710,7 @@ public: Instruction *eraseInstFromFunction(Instruction &I) { LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n'); assert(I.use_empty() && "Cannot erase instruction that is used!"); - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); // Make sure that we reprocess all operands now that we reduced their // use counts. @@ -725,7 +791,7 @@ public: Value *LHS, Value *RHS, Instruction *CxtI) const; /// Maximum size of array considered when transforming. 
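Loosely illustrating canFreelyInvertAllUsersOf() above in source-level terms (hypothetical function): a boolean whose only consumers are selects or branches can be replaced by its negation at no cost, because those users simply swap their arms or destinations:

int pickLarger(int a, int b, int x, int y) {
  bool notLess = !(a < b);
  return notLess ? x : y;   // same as (a < b) ? y : x, so the 'not' is free
}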
- uint64_t MaxArraySizeForCombine; + uint64_t MaxArraySizeForCombine = 0; private: /// Performs a few simplifications for operators which are associative @@ -798,7 +864,8 @@ private: int DmaskIdx = -1); Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, - APInt &UndefElts, unsigned Depth = 0); + APInt &UndefElts, unsigned Depth = 0, + bool AllowMultipleUsers = false); /// Canonicalize the position of binops relative to shufflevector. Instruction *foldVectorBinop(BinaryOperator &Inst); @@ -847,17 +914,21 @@ private: Constant *RHSC); Instruction *foldICmpAddOpConst(Value *X, const APInt &C, ICmpInst::Predicate Pred); - Instruction *foldICmpWithCastAndCast(ICmpInst &ICI); + Instruction *foldICmpWithCastOp(ICmpInst &ICI); Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp); Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp); Instruction *foldICmpWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstant(ICmpInst &Cmp); Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp); - Instruction *foldICmpBinOp(ICmpInst &Cmp); + Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ); Instruction *foldICmpEquality(ICmpInst &Cmp); + Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I); + Instruction *foldSignBitTest(ICmpInst &I); Instruction *foldICmpWithZero(ICmpInst &Cmp); + Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp); + Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select, ConstantInt *C); Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc, @@ -874,6 +945,8 @@ private: const APInt &C); Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr, const APInt &C); + Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv, + const APInt &C); Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv, const APInt &C); Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div, diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 054fb7da09a2..74654f7ef51d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -175,7 +175,7 @@ static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI, uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType()); if (!AllocaSize) return false; - return isDereferenceableAndAlignedPointer(V, AI->getAlignment(), + return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()), APInt(64, AllocaSize), DL); } @@ -197,7 +197,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) { if (C->getValue().getActiveBits() <= 64) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName()); - New->setAlignment(AI.getAlignment()); + New->setAlignment(MaybeAlign(AI.getAlignment())); // Scan to the end of the allocation instructions, to skip over a block of // allocas if possible...also skip interleaved debug info @@ -345,7 +345,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. 
if (AI.getAlignment() == 0) - AI.setAlignment(DL.getPrefTypeAlignment(AI.getAllocatedType())); + AI.setAlignment( + MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType()))); // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should @@ -377,12 +378,12 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // assign it the preferred alignment. if (EntryAI->getAlignment() == 0) EntryAI->setAlignment( - DL.getPrefTypeAlignment(EntryAI->getAllocatedType())); + MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType()))); // Replace this zero-sized alloca with the one at the start of the entry // block after ensuring that the address will be aligned enough for both // types. - unsigned MaxAlign = std::max(EntryAI->getAlignment(), - AI.getAlignment()); + const MaybeAlign MaxAlign( + std::max(EntryAI->getAlignment(), AI.getAlignment())); EntryAI->setAlignment(MaxAlign); if (AI.getType() != EntryAI->getType()) return new BitCastInst(EntryAI, AI.getType()); @@ -448,67 +449,30 @@ static bool isSupportedAtomicType(Type *Ty) { /// /// Note that this will create all of the instructions with whatever insert /// point the \c InstCombiner currently is using. -static LoadInst *combineLoadToNewType(InstCombiner &IC, LoadInst &LI, Type *NewTy, - const Twine &Suffix = "") { +LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy, + const Twine &Suffix) { assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) && "can't fold an atomic load to requested type"); Value *Ptr = LI.getPointerOperand(); unsigned AS = LI.getPointerAddressSpace(); - SmallVector<std::pair<unsigned, MDNode *>, 8> MD; - LI.getAllMetadata(MD); - Value *NewPtr = nullptr; if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) && NewPtr->getType()->getPointerElementType() == NewTy && NewPtr->getType()->getPointerAddressSpace() == AS)) - NewPtr = IC.Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); + NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS)); - LoadInst *NewLoad = IC.Builder.CreateAlignedLoad( - NewTy, NewPtr, LI.getAlignment(), LI.isVolatile(), LI.getName() + Suffix); - NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - MDBuilder MDB(NewLoad->getContext()); - for (const auto &MDPair : MD) { - unsigned ID = MDPair.first; - MDNode *N = MDPair.second; - // Note, essentially every kind of metadata should be preserved here! This - // routine is supposed to clone a load instruction changing *only its type*. - // The only metadata it makes sense to drop is metadata which is invalidated - // when the pointer type changes. This should essentially never be the case - // in LLVM, but we explicitly switch over only known metadata to be - // conservatively correct. If you are adding metadata to LLVM which pertains - // to loads, you almost certainly want to add it here. - switch (ID) { - case LLVMContext::MD_dbg: - case LLVMContext::MD_tbaa: - case LLVMContext::MD_prof: - case LLVMContext::MD_fpmath: - case LLVMContext::MD_tbaa_struct: - case LLVMContext::MD_invariant_load: - case LLVMContext::MD_alias_scope: - case LLVMContext::MD_noalias: - case LLVMContext::MD_nontemporal: - case LLVMContext::MD_mem_parallel_loop_access: - case LLVMContext::MD_access_group: - // All of these directly apply. - NewLoad->setMetadata(ID, N); - break; + unsigned Align = LI.getAlignment(); + if (!Align) + // If old load did not have an explicit alignment specified, + // manually preserve the implied (ABI) alignment of the load. 
+ // Else we may inadvertently incorrectly over-promise alignment. + Align = getDataLayout().getABITypeAlignment(LI.getType()); - case LLVMContext::MD_nonnull: - copyNonnullMetadata(LI, N, *NewLoad); - break; - case LLVMContext::MD_align: - case LLVMContext::MD_dereferenceable: - case LLVMContext::MD_dereferenceable_or_null: - // These only directly apply if the new type is also a pointer. - if (NewTy->isPointerTy()) - NewLoad->setMetadata(ID, N); - break; - case LLVMContext::MD_range: - copyRangeMetadata(IC.getDataLayout(), LI, N, *NewLoad); - break; - } - } + LoadInst *NewLoad = Builder.CreateAlignedLoad( + NewTy, NewPtr, Align, LI.isVolatile(), LI.getName() + Suffix); + NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + copyMetadataForLoad(*NewLoad, LI); return NewLoad; } @@ -569,7 +533,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value /// Returns true if instruction represent minmax pattern like: /// select ((cmp load V1, load V2), V1, V2). -static bool isMinMaxWithLoads(Value *V) { +static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) { assert(V->getType()->isPointerTy() && "Expected pointer type."); // Ignore possible ty* to ixx* bitcast. V = peekThroughBitcast(V); @@ -583,6 +547,7 @@ static bool isMinMaxWithLoads(Value *V) { if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)), m_Value(LHS), m_Value(RHS)))) return false; + LoadTy = L1->getType(); return (match(L1, m_Load(m_Specific(LHS))) && match(L2, m_Load(m_Specific(RHS)))) || (match(L1, m_Load(m_Specific(RHS))) && @@ -628,20 +593,22 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { // size is a legal integer type. // Do not perform canonicalization if minmax pattern is found (to avoid // infinite loop). + Type *Dummy; if (!Ty->isIntegerTy() && Ty->isSized() && + !(Ty->isVectorTy() && Ty->getVectorIsScalable()) && DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) && DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) && !isMinMaxWithLoads( - peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true))) { + peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true), + Dummy)) { if (all_of(LI.users(), [&LI](User *U) { auto *SI = dyn_cast<StoreInst>(U); return SI && SI->getPointerOperand() != &LI && !SI->getPointerOperand()->isSwiftError(); })) { - LoadInst *NewLoad = combineLoadToNewType( - IC, LI, - Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); + LoadInst *NewLoad = IC.combineLoadToNewType( + LI, Type::getIntNTy(LI.getContext(), DL.getTypeStoreSizeInBits(Ty))); // Replace all the stores with stores of the newly loaded value. for (auto UI = LI.user_begin(), UE = LI.user_end(); UI != UE;) { auto *SI = cast<StoreInst>(*UI++); @@ -663,7 +630,7 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) { if (auto* CI = dyn_cast<CastInst>(LI.user_back())) if (CI->isNoopCast(DL)) if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, CI->getDestTy()); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy()); CI->replaceAllUsesWith(NewLoad); IC.eraseInstFromFunction(*CI); return &LI; @@ -691,8 +658,8 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { // If the struct only have one element, we unpack. 
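A hypothetical C++ illustration of why combineLoadToOperationType() above may retype a load whose only users are stores: the copy below moves 32 bits regardless of whether they are viewed as float or integer, so the load can be rebuilt via combineLoadToNewType() at a legal integer type:

void copyBits(float *dst, const float *src) {
  *dst = *src;   // candidate for canonicalization to a 32-bit integer load/store
}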
auto NumElements = ST->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ST->getTypeAtIndex(0U), - ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U), + ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -741,7 +708,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) { auto *ET = AT->getElementType(); auto NumElements = AT->getNumElements(); if (NumElements == 1) { - LoadInst *NewLoad = combineLoadToNewType(IC, LI, ET, ".unpack"); + LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack"); AAMDNodes AAMD; LI.getAAMetadata(AAMD); NewLoad->setAAMetadata(AAMD); @@ -1004,9 +971,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType()); if (KnownAlign > EffectiveLoadAlign) - LI.setAlignment(KnownAlign); + LI.setAlignment(MaybeAlign(KnownAlign)); else if (LoadAlign == 0) - LI.setAlignment(EffectiveLoadAlign); + LI.setAlignment(MaybeAlign(EffectiveLoadAlign)); // Replace GEP indices if possible. if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) { @@ -1063,11 +1030,11 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // if (SelectInst *SI = dyn_cast<SelectInst>(Op)) { // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2). - unsigned Align = LI.getAlignment(); - if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), Align, - DL, SI) && - isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), Align, - DL, SI)) { + const MaybeAlign Alignment(LI.getAlignment()); + if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(), + Alignment, DL, SI) && + isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(), + Alignment, DL, SI)) { LoadInst *V1 = Builder.CreateLoad(LI.getType(), SI->getOperand(1), SI->getOperand(1)->getName() + ".val"); @@ -1075,9 +1042,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Builder.CreateLoad(LI.getType(), SI->getOperand(2), SI->getOperand(2)->getName() + ".val"); assert(LI.isUnordered() && "implied by above"); - V1->setAlignment(Align); + V1->setAlignment(Alignment); V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); - V2->setAlignment(Align); + V2->setAlignment(Alignment); V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); return SelectInst::Create(SI->getCondition(), V1, V2); } @@ -1365,7 +1332,19 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, auto *LI = cast<LoadInst>(SI.getValueOperand()); if (!LI->getType()->isIntegerTy()) return false; - if (!isMinMaxWithLoads(LoadAddr)) + Type *CmpLoadTy; + if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy)) + return false; + + // Make sure the type would actually change. + // This condition can be hit with chains of bitcasts. + if (LI->getType() == CmpLoadTy) + return false; + + // Make sure we're not changing the size of the load/store. + const auto &DL = IC.getDataLayout(); + if (DL.getTypeStoreSizeInBits(LI->getType()) != + DL.getTypeStoreSizeInBits(CmpLoadTy)) return false; if (!all_of(LI->users(), [LI, LoadAddr](User *U) { @@ -1377,8 +1356,7 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC, return false; IC.Builder.SetInsertPoint(LI); - LoadInst *NewLI = combineLoadToNewType( - IC, *LI, LoadAddr->getType()->getPointerElementType()); + LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy); // Replace all the stores with stores of the newly loaded value. 
for (auto *UI : LI->users()) { auto *USI = cast<StoreInst>(UI); @@ -1399,15 +1377,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. @@ -1466,9 +1444,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1))) { ++NumDeadStore; - ++BBI; + // Manually add back the original store to the worklist now, so it will + // be processed after the operands of the removed store, as this may + // expose additional DSE opportunities. + Worklist.Add(&SI); eraseInstFromFunction(*PrevSI); - continue; + return nullptr; } break; } @@ -1622,8 +1603,8 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { // Advance to a place where it is safe to insert the new store and insert it. BBI = DestBB->getFirstInsertionPt(); - StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), - SI.isVolatile(), SI.getAlignment(), + StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), + MaybeAlign(SI.getAlignment()), SI.getOrdering(), SI.getSyncScopeID()); InsertNewInstBefore(NewSI, *BBI); NewSI->setDebugLoc(MergedLoc); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index cc753ce05313..2774e46151fa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -124,6 +124,50 @@ static Constant *getLogBase2(Type *Ty, Constant *C) { return ConstantVector::get(Elts); } +// TODO: This is a specific form of a much more general pattern. +// We could detect a select with any binop identity constant, or we +// could use SimplifyBinOp to see if either arm of the select reduces. +// But that needs to be done carefully and/or while removing potential +// reverse canonicalizations as in InstCombiner::foldSelectIntoOp(). 
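A small, hypothetical C++ example of the store sequence affected by the visitStoreInst change above: the first store is dead, and re-queuing the surviving store lets further dead stores surface once its operands are revisited:

void overwrite(int *p, int a, int b) {
  *p = a;   // dead: unconditionally overwritten below with no intervening read
  *p = b;
}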
+static Value *foldMulSelectToNegate(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + Value *Cond, *OtherOp; + + // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp + // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp)); + + // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp + // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())), + m_Value(OtherOp)))) + return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp); + + // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp + // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0), + m_SpecificFP(-1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp)); + } + + // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp + // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp + if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0), + m_SpecificFP(1.0))), + m_Value(OtherOp)))) { + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(I.getFastMathFlags()); + return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp); + } + + return nullptr; +} + Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1), SQ.getWithInstruction(&I))) @@ -213,6 +257,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // Simplify mul instructions with a constant RHS. if (isa<Constant>(Op1)) { // Canonicalize (X+C1)*CI -> X*CI+C1*CI. @@ -358,6 +405,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I)) return FoldedMul; + if (Value *FoldedMul = foldMulSelectToNegate(I, Builder)) + return replaceInstUsesWith(I, FoldedMul); + // X * -1.0 --> -X Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); if (match(Op1, m_SpecificFP(-1.0))) @@ -373,16 +423,6 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C))) return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I); - // Sink negation: -X * Y --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. - if (match(Op0, m_OneUse(m_FNeg(m_Value(X)))) && !isa<ConstantExpr>(Op0)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op1, &I), &I); - - // Sink negation: Y * -X --> -(X * Y) - // But don't transform constant expressions because there's an inverse fold. 
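A compact C++ sketch of the pattern foldMulSelectToNegate() above recognizes (function name hypothetical): multiplying by a select of +1/-1 is just a conditional negation, so no multiply is needed:

int applySign(bool negate, int v) {
  int s = negate ? -1 : 1;
  return s * v;            // expected to fold to: negate ? -v : v
}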
- if (match(Op1, m_OneUse(m_FNeg(m_Value(X)))) && !isa<ConstantExpr>(Op1)) - return BinaryOperator::CreateFNegFMF(Builder.CreateFMulFMF(X, Op0, &I), &I); - // fabs(X) * fabs(X) -> X * X if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X)))) return BinaryOperator::CreateFMulFMF(X, X, &I); @@ -1199,6 +1239,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); return BinaryOperator::CreateFDivFMF(YZ, X, &I); } + // Z / (1.0 / Y) => (Y * Z) + // + // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The + // m_OneUse check is avoided because even in the case of the multiple uses + // for 1.0/Y, the number of instructions remain the same and a division is + // replaced by a multiplication. + if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(Y, Op0, &I); } if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { @@ -1211,8 +1259,8 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) && match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X))); - if ((IsTan || IsCot) && hasUnaryFloatFn(&TLI, I.getType(), LibFunc_tan, - LibFunc_tanf, LibFunc_tanl)) { + if ((IsTan || IsCot) && + hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) { IRBuilder<> B(&I); IRBuilder<>::FastMathFlagGuard FMFGuard(B); B.setFastMathFlags(I.getFastMathFlags()); @@ -1244,6 +1292,17 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { return &I; } + // X / fabs(X) -> copysign(1.0, X) + // fabs(X) / X -> copysign(1.0, X) + if (I.hasNoNaNs() && I.hasNoInfs() && + (match(&I, + m_FDiv(m_Value(X), m_Intrinsic<Intrinsic::fabs>(m_Deferred(X)))) || + match(&I, m_FDiv(m_Intrinsic<Intrinsic::fabs>(m_Value(X)), + m_Deferred(X))))) { + Value *V = Builder.CreateBinaryIntrinsic( + Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I); + return replaceInstUsesWith(I, V); + } return nullptr; } @@ -1309,14 +1368,18 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { + // This may increase instruction count, we don't enforce that Y is a + // constant. Constant *N1 = Constant::getAllOnesValue(Ty); Value *Add = Builder.CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); } // 1 urem X -> zext(X != 1) - if (match(Op0, m_One())) - return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty); + if (match(Op0, m_One())) { + Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1)); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } // X urem C -> X < C ? X : X - C, where C >= signbit. 
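An illustrative C++ example of the visitURem() fold above, assuming the divisor can be proven to be a power of two; names are hypothetical:

#include <cstdint>

uint32_t modPow2(uint32_t x, uint32_t shift) {
  uint32_t p = 1u << shift;   // a power of two for shift < 32
  return x % p;               // expected to fold to: x & (p - 1)
}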
if (match(Op1, m_Negative())) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 5820ab726637..74e015a4f1d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -14,9 +14,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -180,13 +181,14 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) { "Not enough available ptr typed incoming values"); PHINode *MatchingPtrPHI = nullptr; unsigned NumPhis = 0; - for (auto II = BB->begin(), EI = BasicBlock::iterator(BB->getFirstNonPHI()); - II != EI; II++, NumPhis++) { + for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) { // FIXME: consider handling this in AggressiveInstCombine + PHINode *PtrPHI = dyn_cast<PHINode>(II); + if (!PtrPHI) + break; if (NumPhis > MaxNumPhis) return nullptr; - PHINode *PtrPHI = dyn_cast<PHINode>(II); - if (!PtrPHI || PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) + if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType()) continue; MatchingPtrPHI = PtrPHI; for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) { @@ -542,7 +544,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // visitLoadInst will propagate an alignment onto the load when TD is around, // and if TD isn't around, we can't handle the mixed case. bool isVolatile = FirstLI->isVolatile(); - unsigned LoadAlignment = FirstLI->getAlignment(); + MaybeAlign LoadAlignment(FirstLI->getAlignment()); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); // We can't sink the load if the loaded value could be modified between the @@ -574,10 +576,10 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. - if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) + if ((LoadAlignment.hasValue()) != (LI->getAlignment() != 0)) return nullptr; - LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); + LoadAlignment = std::min(LoadAlignment, MaybeAlign(LI->getAlignment())); // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index aefaf5af1750..49645e9460cd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -704,16 +704,24 @@ static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI, assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) && "Unexpected isUnsigned predicate!"); - // Account for swapped form of subtraction: ((a > b) ? b - a : 0). + // Ensure the sub is of the form: + // (a > b) ? a - b : 0 -> usub.sat(a, b) + // (a > b) ? b - a : 0 -> -usub.sat(a, b) + // Checking for both a-b and a+(-b) as a constant. 
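A minimal C++ sketch of the saturated-subtract shape canonicalizeSaturatedSubtract() matches (function name hypothetical):

#include <cstdint>

uint32_t subSat(uint32_t a, uint32_t b) {
  return a > b ? a - b : 0;   // expected to become a saturating subtraction
}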
bool IsNegative = false; - if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A)))) + const APInt *C; + if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) || + (match(A, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C))))) IsNegative = true; - else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B)))) + else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) && + !(match(B, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C))))) return nullptr; - // If sub is used anywhere else, we wouldn't be able to eliminate it - // afterwards. - if (!TrueVal->hasOneUse()) + // If we are adding a negate and the sub and icmp are used anywhere else, we + // would end up with more instructions. + if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse()) return nullptr; // (a > b) ? a - b : 0 -> usub.sat(a, b) @@ -781,10 +789,52 @@ static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal, return Builder.CreateBinaryIntrinsic( Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1)); } + // The overflow may be detected via the add wrapping round. + if (match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) && + match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) { + // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y) + // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y) + return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y); + } return nullptr; } +/// Fold the following code sequence: +/// \code +/// int a = ctlz(x & -x); +// x ? 31 - a : a; +/// \code +/// +/// into: +/// cttz(x) +static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { + unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits(); + if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero())) + return nullptr; + + if (ICI->getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + if (!match(FalseVal, + m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1)))) + return nullptr; + + if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>())) + return nullptr; + + Value *X = ICI->getOperand(0); + auto *II = cast<IntrinsicInst>(TrueVal); + if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) + return nullptr; + + Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, + II->getType()); + return CallInst::Create(F, {X, II->getArgOperand(1)}); +} + /// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single /// call to cttz/ctlz with flag 'is_zero_undef' cleared. /// @@ -963,6 +1013,12 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, Cmp.getPredicate() == CanonicalPred) return nullptr; + // Bail out on unsimplified X-0 operand (due to some worklist management bug), + // as this may cause an infinite combine loop. Let the sub be folded first. + if (match(LHS, m_Sub(m_Value(), m_Zero())) || + match(RHS, m_Sub(m_Value(), m_Zero()))) + return nullptr; + // Create the canonical compare and plug it into the select. Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS)); @@ -973,8 +1029,7 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp, // If we are swapping the select operands, swap the metadata too. 
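And the corresponding sketch for the new wrap-around case in canonicalizeSaturatedAdd() above (hypothetical function name): clamping an unsigned sum at the maximum when it wraps is a saturating add:

#include <cstdint>

uint32_t addSat(uint32_t x, uint32_t y) {
  uint32_t s = x + y;
  return s < x ? UINT32_MAX : s;   // expected to become a saturating add
}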
assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS && "Unexpected results from matchSelectPattern"); - Sel.setTrueValue(LHS); - Sel.setFalseValue(RHS); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } @@ -1056,17 +1111,293 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp, } // We are swapping the select operands, so swap the metadata too. - Sel.setTrueValue(FVal); - Sel.setFalseValue(TVal); + Sel.swapValues(); Sel.swapProfMetadata(); return &Sel; } +static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *ReplaceOp, + const SimplifyQuery &Q) { + // If this is a binary operator, try to simplify it with the replaced op + // because we know Op and ReplaceOp are equivalant. + // For example: V = X + 1, Op = X, ReplaceOp = 42 + // Simplifies as: add(42, 1) --> 43 + if (auto *BO = dyn_cast<BinaryOperator>(V)) { + if (BO->getOperand(0) == Op) + return SimplifyBinOp(BO->getOpcode(), ReplaceOp, BO->getOperand(1), Q); + if (BO->getOperand(1) == Op) + return SimplifyBinOp(BO->getOpcode(), BO->getOperand(0), ReplaceOp, Q); + } + + return nullptr; +} + +/// If we have a select with an equality comparison, then we know the value in +/// one of the arms of the select. See if substituting this value into an arm +/// and simplifying the result yields the same value as the other arm. +/// +/// To make this transform safe, we must drop poison-generating flags +/// (nsw, etc) if we simplified to a binop because the select may be guarding +/// that poison from propagating. If the existing binop already had no +/// poison-generating flags, then this transform can be done by instsimplify. +/// +/// Consider: +/// %cmp = icmp eq i32 %x, 2147483647 +/// %add = add nsw i32 %x, 1 +/// %sel = select i1 %cmp, i32 -2147483648, i32 %add +/// +/// We can't replace %sel with %add unless we strip away the flags. +/// TODO: Wrapping flags could be preserved in some cases with better analysis. +static Value *foldSelectValueEquivalence(SelectInst &Sel, ICmpInst &Cmp, + const SimplifyQuery &Q) { + if (!Cmp.isEquality()) + return nullptr; + + // Canonicalize the pattern to ICMP_EQ by swapping the select operands. + Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue(); + if (Cmp.getPredicate() == ICmpInst::ICMP_NE) + std::swap(TrueVal, FalseVal); + + // Try each equivalence substitution possibility. + // We have an 'EQ' comparison, so the select's false value will propagate. + // Example: + // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1 + // (X == 42) ? (X + 1) : 43 --> (X == 42) ? 
(42 + 1) : 43 --> 43 + Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); + if (simplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, Q) == TrueVal || + simplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, Q) == TrueVal || + simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, Q) == FalseVal || + simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, Q) == FalseVal) { + if (auto *FalseInst = dyn_cast<Instruction>(FalseVal)) + FalseInst->dropPoisonGeneratingFlags(); + return FalseVal; + } + return nullptr; +} + +// See if this is a pattern like: +// %old_cmp1 = icmp slt i32 %x, C2 +// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high +// %old_x_offseted = add i32 %x, C1 +// %old_cmp0 = icmp ult i32 %old_x_offseted, C0 +// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement +// This can be rewritten as the more canonical pattern: +// %new_cmp1 = icmp slt i32 %x, -C1 +// %new_cmp2 = icmp sge i32 %x, C0-C1 +// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x +// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low +// Iff -C1 s<= C2 s<= C0-C1 +// The ULT predicate can also be UGT iff C0 != -1 (+invert result) +// The SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.) +static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0, + InstCombiner::BuilderTy &Builder) { + Value *X = Sel0.getTrueValue(); + Value *Sel1 = Sel0.getFalseValue(); + + // First match the condition of the outermost select. + // Said condition must be one-use. + if (!Cmp0.hasOneUse()) + return nullptr; + Value *Cmp00 = Cmp0.getOperand(0); + Constant *C0; + if (!match(Cmp0.getOperand(1), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))) + return nullptr; + // Canonicalize Cmp0 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Cmp0.getPredicate()) { + case ICmpInst::Predicate::ICMP_ULT: + break; // Great! + case ICmpInst::Predicate::ICMP_ULE: + // We'd have to increment C0 by one, and for that it must not have an + // all-ones element, but then it would have been canonicalized to 'ult' + // before we get here. So we can't do anything useful with 'ule'. + return nullptr; + case ICmpInst::Predicate::ICMP_UGT: + // We want to canonicalize it to 'ult', so we'll need to increment C0, + // which again means it must not have any all-ones elements. + if (!match(C0, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getAllOnesValue( + C0->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have all-ones element[s]. + C0 = AddOne(C0); + std::swap(X, Sel1); + break; + case ICmpInst::Predicate::ICMP_UGE: + // The only way we'd get this predicate is if this `icmp` has extra uses, + // but then we won't be able to do this fold. + return nullptr; + default: + return nullptr; // Unknown predicate. + } + + // Now that we've canonicalized the ICmp, we know the X we expect; + // the select, on the other hand, should be one-use. + if (!Sel1->hasOneUse()) + return nullptr; + + // We can now finish matching the condition of the outermost select: + // it should either be the X itself, or an addition of some constant to X.
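To make the clamp-like rewrite described in the comment above concrete, here is a C++ sketch with illustrative constants C1 = 32, C0 = 64, C2 = 0, which satisfy the precondition -C1 s<= C2 s<= C0-C1 (the signed-overflow caveat on x + 32 exists only at the C++ level; the IR add simply wraps):

    int clamp_like(int x, int target_low, int target_high) {
      int repl = x < 0 ? target_low : target_high; // %old_cmp1 / %old_replacement
      return (unsigned)(x + 32) < 64u ? x : repl;  // %old_cmp0 / %r
    }

    // ...which canonicalizeClampLike rewrites into the two-select form:
    int clamp_canonical(int x, int target_low, int target_high) {
      int low = x < -32 ? target_low : x;          // %new_cmp1 / %new_clamped_low
      return x >= 32 ? target_high : low;          // %new_cmp2 / %r
    }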
+ Constant *C1; + if (Cmp00 == X) + C1 = ConstantInt::getNullValue(Sel0.getType()); + else if (!match(Cmp00, + m_Add(m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1))))) + return nullptr; + + Value *Cmp1; + ICmpInst::Predicate Pred1; + Constant *C2; + Value *ReplacementLow, *ReplacementHigh; + if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow), + m_Value(ReplacementHigh))) || + !match(Cmp1, + m_ICmp(Pred1, m_Specific(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2))))) + return nullptr; + + if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse())) + return nullptr; // Not enough one-use instructions for the fold. + // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of + // two comparisons we'll need to build. + + // Canonicalize Cmp1 into the form we expect. + // FIXME: we shouldn't care about lanes that are 'undef' in the end? + switch (Pred1) { + case ICmpInst::Predicate::ICMP_SLT: + break; + case ICmpInst::Predicate::ICMP_SLE: + // We'd have to increment C2 by one, and for that it must not have signed + // max element, but then it would have been canonicalized to 'slt' before + // we get here. So we can't do anything useful with 'sle'. + return nullptr; + case ICmpInst::Predicate::ICMP_SGT: + // We want to canonicalize it to 'slt', so we'll need to increment C2, + // which again means it must not have any signed max elements. + if (!match(C2, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE, + APInt::getSignedMaxValue( + C2->getType()->getScalarSizeInBits())))) + return nullptr; // Can't do, have signed max element[s]. + C2 = AddOne(C2); + LLVM_FALLTHROUGH; + case ICmpInst::Predicate::ICMP_SGE: + // Also non-canonical, but here we don't need to change C2, + // so we don't have any restrictions on C2, so we can just handle it. + std::swap(ReplacementLow, ReplacementHigh); + break; + default: + return nullptr; // Unknown predicate. + } + + // The thresholds of this clamp-like pattern. + auto *ThresholdLowIncl = ConstantExpr::getNeg(C1); + auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1); + + // The fold has a precondition 1: C2 s>= ThresholdLow + auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2, + ThresholdLowIncl); + if (!match(Precond1, m_One())) + return nullptr; + // The fold has a precondition 2: C2 s<= ThresholdHigh + auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2, + ThresholdHighExcl); + if (!match(Precond2, m_One())) + return nullptr; + + // All good, finally emit the new pattern. + Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl); + Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl); + Value *MaybeReplacedLow = + Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X); + Instruction *MaybeReplacedHigh = + SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow); + + return MaybeReplacedHigh; +} + +// If we have +// %cmp = icmp [canonical predicate] i32 %x, C0 +// %r = select i1 %cmp, i32 %y, i32 C1 +// Where C0 != C1 and %x may be different from %y, see if the constant that we +// will have if we flip the strictness of the predicate (i.e. without changing +// the result) is identical to the C1 in select. If it matches we can change +// original comparison to one with swapped predicate, reuse the constant, +// and swap the hands of select. 
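A concrete instance of the rewrite described just above, sketched in C++ with an illustrative constant C0 = 4:

    unsigned pick_before(unsigned x, unsigned y) { return x < 4u ? y : 3u; }
    // Flipping the strictness of (ult, 4) yields (ule, 3); the constant 3
    // already appears in the select, so the comparison is rebuilt with the
    // swapped predicate and the select arms are exchanged:
    unsigned pick_after(unsigned x, unsigned y) { return x > 3u ? 3u : y; }

Both functions compute the same value; the rewritten form reuses the select constant in the comparison, so the original constant C0 disappears.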
+static Instruction * +tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp, + InstCombiner::BuilderTy &Builder) { + ICmpInst::Predicate Pred; + Value *X; + Constant *C0; + if (!match(&Cmp, m_OneUse(m_ICmp( + Pred, m_Value(X), + m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0)))))) + return nullptr; + + // If the comparison predicate is non-relational, we won't be able to do anything. + if (ICmpInst::isEquality(Pred)) + return nullptr; + + // If the comparison predicate is non-canonical, then we certainly won't be able + // to make it canonical; canonicalizeCmpWithConstant() already tried. + if (!isCanonicalPredicate(Pred)) + return nullptr; + + // If the [input] type of comparison and select type are different, let's abort + // for now. We could try to compare constants with trunc/[zs]ext though. + if (C0->getType() != Sel.getType()) + return nullptr; + + // FIXME: are there any magic icmp predicate+constant pairs we must not touch? + + Value *SelVal0, *SelVal1; // We do not care which one is from where. + match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1))); + // At least one of these values we are selecting between must be a constant, + // else we'll never succeed. + if (!match(SelVal0, m_AnyIntegralConstant()) && + !match(SelVal1, m_AnyIntegralConstant())) + return nullptr; + + // Does this constant C match any of the `select` values? + auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) { + return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1); + }; + + // If C0 *already* matches the true/false value of the select, we are done. + if (MatchesSelectValue(C0)) + return nullptr; + + // Check the constant we'd have with flipped-strictness predicate. + auto FlippedStrictness = getFlippedStrictnessPredicateAndConstant(Pred, C0); + if (!FlippedStrictness) + return nullptr; + + // If said constant doesn't match either, then there is no hope. + if (!MatchesSelectValue(FlippedStrictness->second)) + return nullptr; + + // It matched! Let's insert the new comparison just before the select. + InstCombiner::BuilderTy::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(&Sel); + + Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped. + Value *NewCmp = Builder.CreateICmp(Pred, X, FlippedStrictness->second, + Cmp.getName() + ".inv"); + Sel.setCondition(NewCmp); + Sel.swapValues(); + Sel.swapProfMetadata(); + + return &Sel; +} + /// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI) { - Value *TrueVal = SI.getTrueValue(); - Value *FalseVal = SI.getFalseValue(); + if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ)) + return replaceInstUsesWith(SI, V); if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder)) return NewSel; @@ -1074,12 +1405,21 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder)) return NewAbs; + if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder)) + return NewAbs; + + if (Instruction *NewSel = + tryToReuseConstantFromSelectInComparison(SI, *ICI, Builder)) + return NewSel; + bool Changed = adjustMinMax(SI, *ICI); if (Value *V = foldSelectICmpAnd(SI, ICI, Builder)) return replaceInstUsesWith(SI, V); // NOTE: if we wanted to, this is where to detect integer MIN/MAX + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); ICmpInst::Predicate Pred = ICI->getPredicate(); Value *CmpLHS = ICI->getOperand(0); Value *CmpRHS = ICI->getOperand(1); @@ -1149,6 +1489,9 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI, foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) return V; + if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder)) + return V; + if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -1253,6 +1596,16 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, } } + // max(max(A, B), min(A, B)) --> max(A, B) + // min(min(A, B), max(A, B)) --> min(A, B) + // TODO: This could be done in instsimplify. + if (SPF1 == SPF2 && + ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) || + (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B)))))) + return replaceInstUsesWith(Outer, Inner); + // ABS(ABS(X)) -> ABS(X) // NABS(NABS(X)) -> NABS(X) // TODO: This could be done in instsimplify. @@ -1280,7 +1633,7 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner, return true; } - if (IsFreeToInvert(V, !V->hasNUsesOrMore(3))) { + if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) { NotV = nullptr; return true; } @@ -1393,6 +1746,128 @@ static Instruction *foldAddSubSelect(SelectInst &SI, return nullptr; } +/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y +/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y +/// Along with a number of patterns similar to: +/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y +/// X - Y overflows ? (X > 0 ? 
INTMAX : INTMIN) : X - Y --> ssub_sat X, Y +static Instruction * +foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { + Value *CondVal = SI.getCondition(); + Value *TrueVal = SI.getTrueValue(); + Value *FalseVal = SI.getFalseValue(); + + WithOverflowInst *II; + if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) || + !match(FalseVal, m_ExtractValue<0>(m_Specific(II)))) + return nullptr; + + Value *X = II->getLHS(); + Value *Y = II->getRHS(); + + auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) { + Type *Ty = Limit->getType(); + + ICmpInst::Predicate Pred; + Value *TrueVal, *FalseVal, *Op; + const APInt *C; + if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)), + m_Value(TrueVal), m_Value(FalseVal)))) + return false; + + auto IsZeroOrOne = [](const APInt &C) { + return C.isNullValue() || C.isOneValue(); + }; + auto IsMinMax = [&](Value *Min, Value *Max) { + APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits()); + APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits()); + return match(Min, m_SpecificInt(MinVal)) && + match(Max, m_SpecificInt(MaxVal)); + }; + + if (Op != X && Op != Y) + return false; + + if (IsAdd) { + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(FalseVal, TrueVal)) + return true; + } else { + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) && + IsMinMax(FalseVal, TrueVal)) + return true; + // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) && + IsMinMax(TrueVal, FalseVal)) + return true; + } + + return false; + }; + + Intrinsic::ID NewIntrinsicID; + if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow && + match(TrueVal, m_AllOnes())) + // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y + NewIntrinsicID = Intrinsic::uadd_sat; + else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow && + match(TrueVal, m_Zero())) + // X - Y overflows ? 
0 : X - Y -> usub_sat X, Y + NewIntrinsicID = Intrinsic::usub_sat; + else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true)) + // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y + NewIntrinsicID = Intrinsic::sadd_sat; + else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow && + IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false)) + // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y + NewIntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + Function *F = + Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + return CallInst::Create(F, {X, Y}); +} + Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) { Constant *C; if (!match(Sel.getTrueValue(), m_Constant(C)) && @@ -1492,6 +1967,30 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) { ConstantVector::get(Mask)); } +/// If we have a select of vectors with a scalar condition, try to convert that +/// to a vector select by splatting the condition. A splat may get folded with +/// other operations in IR and having all operands of a select be vector types +/// is likely better for vector codegen. +static Instruction *canonicalizeScalarSelectOfVecs( + SelectInst &Sel, InstCombiner::BuilderTy &Builder) { + Type *Ty = Sel.getType(); + if (!Ty->isVectorTy()) + return nullptr; + + // We can replace a single-use extract with constant index. + Value *Cond = Sel.getCondition(); + if (!match(Cond, m_OneUse(m_ExtractElement(m_Value(), m_ConstantInt())))) + return nullptr; + + // select (extelt V, Index), T, F --> select (splat V, Index), T, F + // Splatting the extracted condition reduces code (we could directly create a + // splat shuffle of the source vector to eliminate the intermediate step). + unsigned NumElts = Ty->getVectorNumElements(); + Value *SplatCond = Builder.CreateVectorSplat(NumElts, Cond); + Sel.setCondition(SplatCond); + return &Sel; +} + /// Reuse bitcasted operands between a compare and select: /// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) --> /// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D)) @@ -1648,6 +2147,71 @@ static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X, return nullptr; } +/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value. 
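The doc comment above compresses a fairly specific tree; a C++ rendering of the 8-bit-in-32-bit case it targets may help (all names and widths illustrative):

    #include <cstdint>

    int32_t sadd_sat_i8_widened(int8_t a, int8_t b) {
      int32_t s = (int32_t)a + (int32_t)b; // add(sext(A), sext(B))
      s = s > 127 ? 127 : s;               // smin(..., INT8_MAX)
      s = s < -128 ? -128 : s;             // smax(..., INT8_MIN)
      return s;
    }

Here MaxValue + 1 = 128 is a power of two and -MinValue == MaxValue + 1, so NewBitWidth works out to 8 and the whole tree can become sext(llvm.sadd.sat.i8(a, b)) back to i32.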
+Instruction *InstCombiner::matchSAddSubSat(SelectInst &MinMax1) { + Type *Ty = MinMax1.getType(); + + // We are looking for a tree of: + // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B)))) + // Where the min and max could be reversed + Instruction *MinMax2; + BinaryOperator *AddSub; + const APInt *MinValue, *MaxValue; + if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) { + if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue)))) + return nullptr; + } else if (match(&MinMax1, + m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) { + if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue)))) + return nullptr; + } else + return nullptr; + + // Check that the constants clamp a saturate, and that the new type would be + // sensible to convert to. + if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1) + return nullptr; + // In what bitwidth can this be treated as saturating arithmetic? + unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1; + // FIXME: This isn't quite right for vectors, but using the scalar type is a + // good first approximation for what should be done there. + if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth)) + return nullptr; + + // Also make sure that the number of uses is as expected. The "3"s are for + // the two items of min/max (the compare and the select). + if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3)) + return nullptr; + + // Create the new type (which can be a vector type) + Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth); + // Match the two extends from the add/sub + Value *A, *B; + if (!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B))))) + return nullptr; + // And check the incoming values are of a type smaller than or equal to the + // size of the saturation. Otherwise the higher bits can cause different + // results. + if (A->getType()->getScalarSizeInBits() > NewBitWidth || + B->getType()->getScalarSizeInBits() > NewBitWidth) + return nullptr; + + Intrinsic::ID IntrinsicID; + if (AddSub->getOpcode() == Instruction::Add) + IntrinsicID = Intrinsic::sadd_sat; + else if (AddSub->getOpcode() == Instruction::Sub) + IntrinsicID = Intrinsic::ssub_sat; + else + return nullptr; + + // Finally create and return the sat intrinsic, truncated to the new type + Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Value *AT = Builder.CreateSExt(A, NewTy); + Value *BT = Builder.CreateSExt(B, NewTy); + Value *Sat = Builder.CreateCall(F, {AT, BT}); + return CastInst::Create(Instruction::SExt, Sat, Ty); +} + /// Reduce a sequence of min/max with a common operand. static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS, Value *RHS, @@ -1788,6 +2352,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = canonicalizeSelectToShuffle(SI)) return I; + if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, Builder)) + return I; + // Canonicalize a one-use integer compare with a non-canonical predicate by // inverting the predicate and swapping the select operands. This matches a // compare canonicalization for conditional branches. @@ -1872,7 +2439,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // See if we are selecting two values based on a comparison of the two values.
if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) { - if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) { + Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1); + if ((Cmp0 == TrueVal && Cmp1 == FalseVal) || + (Cmp0 == FalseVal && Cmp1 == TrueVal)) { // Canonicalize to use ordered comparisons by swapping the select // operands. // @@ -1881,30 +2450,12 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { FCmpInst::Predicate InvPred = FCI->getInversePredicate(); IRBuilder<>::FastMathFlagGuard FMFG(Builder); + // FIXME: The FMF should propagate from the select, not the fcmp. Builder.setFastMathFlags(FCI->getFastMathFlags()); - Value *NewCond = Builder.CreateFCmp(InvPred, TrueVal, FalseVal, + Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1, FCI->getName() + ".inv"); - - return SelectInst::Create(NewCond, FalseVal, TrueVal, - SI.getName() + ".p"); - } - - // NOTE: if we wanted to, this is where to detect MIN/MAX - } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){ - // Canonicalize to use ordered comparisons by swapping the select - // operands. - // - // e.g. - // (X ugt Y) ? X : Y -> (X ole Y) ? X : Y - if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) { - FCmpInst::Predicate InvPred = FCI->getInversePredicate(); - IRBuilder<>::FastMathFlagGuard FMFG(Builder); - Builder.setFastMathFlags(FCI->getFastMathFlags()); - Value *NewCond = Builder.CreateFCmp(InvPred, FalseVal, TrueVal, - FCI->getName() + ".inv"); - - return SelectInst::Create(NewCond, FalseVal, TrueVal, - SI.getName() + ".p"); + Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal); + return replaceInstUsesWith(SI, NewSel); } // NOTE: if we wanted to, this is where to detect MIN/MAX @@ -1967,6 +2518,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *Add = foldAddSubSelect(SI, Builder)) return Add; + if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder)) + return Add; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) auto *TI = dyn_cast<Instruction>(TrueVal); @@ -2013,16 +2566,17 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { (LHS->getType()->isFPOrFPVectorTy() && ((CmpLHS != LHS && CmpLHS != RHS) || (CmpRHS != LHS && CmpRHS != RHS)))) { - CmpInst::Predicate Pred = getMinMaxPred(SPF, SPR.Ordered); + CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered); Value *Cmp; - if (CmpInst::isIntPredicate(Pred)) { - Cmp = Builder.CreateICmp(Pred, LHS, RHS); + if (CmpInst::isIntPredicate(MinMaxPred)) { + Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS); } else { IRBuilder<>::FastMathFlagGuard FMFG(Builder); - auto FMF = cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); + auto FMF = + cast<FPMathOperator>(SI.getCondition())->getFastMathFlags(); Builder.setFastMathFlags(FMF); - Cmp = Builder.CreateFCmp(Pred, LHS, RHS); + Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS); } Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI); @@ -2040,9 +2594,9 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * { Value *A; if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) && - !IsFreeToInvert(A, A->hasOneUse()) && + !isFreeToInvert(A, A->hasOneUse()) && // Passing false to only consider m_Not and constants. 
- IsFreeToInvert(Y, false)) { + isFreeToInvert(Y, false)) { Value *B = Builder.CreateNot(Y); Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF), A, B); @@ -2070,6 +2624,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder)) return I; + if (Instruction *I = matchSAddSubSat(SI)) + return I; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index c821292400cd..fbff5dd4a8cd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -25,50 +25,332 @@ using namespace PatternMatch; // we should rewrite it as // x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) // This is valid for any shift, but they must be identical. -static Instruction * -reassociateShiftAmtsOfTwoSameDirectionShifts(BinaryOperator *Sh0, - const SimplifyQuery &SQ) { - // Look for: (x shiftopcode ShAmt0) shiftopcode ShAmt1 - Value *X, *ShAmt1, *ShAmt0; +// +// AnalyzeForSignBitExtraction indicates that we will only analyze whether this +// pattern has any 2 right-shifts that sum to 1 less than the original bit width. +Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts( + BinaryOperator *Sh0, const SimplifyQuery &SQ, + bool AnalyzeForSignBitExtraction) { + // Look for a shift of some instruction; ignore zext of shift amount if any. + Instruction *Sh0Op0; + Value *ShAmt0; + if (!match(Sh0, + m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0))))) + return nullptr; + + // If there is a truncation between the two shifts, we must make note of it + // and look through it. The truncation imposes additional constraints on the + // transform. Instruction *Sh1; + Value *Trunc = nullptr; + match(Sh0Op0, + m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)), + m_Instruction(Sh1))); + + // Inner shift: (x shiftopcode ShAmt1) + // As with the other shift, ignore zext of shift amount if any. + Value *X, *ShAmt1; + if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1))))) + return nullptr; + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (ShAmt0->getType() != ShAmt1->getType()) + return nullptr; + + // We are only looking for sign-bit extraction if we have two right shifts. + bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) && + match(Sh1, m_Shr(m_Value(), m_Value())); + // ... and if it's not two right-shifts, we know the answer already. + if (AnalyzeForSignBitExtraction && !HadTwoRightShifts) return nullptr; - // The shift opcodes must be identical. + // The shift opcodes must be identical, unless we are just checking whether + // this pattern can be interpreted as a sign-bit-extraction. Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode(); - if (ShiftOpcode != Sh1->getOpcode()) + bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode(); + if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction) + return nullptr; + + // If we saw truncation, we'll need to produce an extra instruction, + // and for that one of the operands of the shift must be one-use, + // unless of course we don't actually plan to produce any instructions here.
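In C++ terms, the basic reassociation, and the sign-bit special case that the new AnalyzeForSignBitExtraction mode looks for, amount to the following sketch (assuming an arithmetic right shift for signed int, which is what ashr guarantees at the IR level):

    unsigned reassoc(unsigned x) {
      return (x >> 3) >> 5;   // 3 + 5 u< 32, so this is just x >> 8
    }

    int sign_bit(int x) {
      return (x >> 16) >> 15; // 16 + 15 == 31 == bitwidth - 1: x >> 31
    }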
+ if (Trunc && !AnalyzeForSignBitExtraction && + !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) return nullptr; + // Can we fold (ShAmt0+ShAmt1) ? - Value *NewShAmt = SimplifyBinOp(Instruction::BinaryOps::Add, ShAmt0, ShAmt1, - SQ.getWithInstruction(Sh0)); + auto *NewShAmt = dyn_cast_or_null<Constant>( + SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false, + SQ.getWithInstruction(Sh0))); if (!NewShAmt) return nullptr; // Did not simplify. - // Is the new shift amount smaller than the bit width? - // FIXME: could also rely on ConstantRange. - unsigned BitWidth = X->getType()->getScalarSizeInBits(); + unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits(); + unsigned XBitWidth = X->getType()->getScalarSizeInBits(); + // Is the new shift amount smaller than the bit width of inner/new shift? if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT, - APInt(BitWidth, BitWidth)))) - return nullptr; + APInt(NewShAmtBitWidth, XBitWidth)))) + return nullptr; // FIXME: could perform constant-folding. + + // If there was a truncation, and we have a right-shift, we can only fold if + // we are left with the original sign bit. Likewise, if we were just checking + // that this is a sign-bit extraction, this is the place to check it. + // FIXME: zero shift amount is also legal here, but we can't *easily* check + // more than one predicate so it's not really worth it. + if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) { + // If it's not a sign bit extraction, then we're done. + if (!match(NewShAmt, + m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(NewShAmtBitWidth, XBitWidth - 1)))) + return nullptr; + // If it is, and that was the question, return the base value. + if (AnalyzeForSignBitExtraction) + return X; + } + + assert(IdenticalShOpcodes && "Should not get here with different shifts."); + // All good, we can do this fold. + NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType()); + BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt); - // If both of the original shifts had the same flag set, preserve the flag. - if (ShiftOpcode == Instruction::BinaryOps::Shl) { - NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && - Sh1->hasNoUnsignedWrap()); - NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && - Sh1->hasNoSignedWrap()); - } else { - NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + + // The flags can only be propagated if there wasn't a trunc. + if (!Trunc) { + // If the pattern did not involve trunc, and both of the original shifts + // had the same flag set, preserve the flag. + if (ShiftOpcode == Instruction::BinaryOps::Shl) { + NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() && + Sh1->hasNoUnsignedWrap()); + NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() && + Sh1->hasNoSignedWrap()); + } else { + NewShift->setIsExact(Sh0->isExact() && Sh1->isExact()); + } } - return NewShift; + + Instruction *Ret = NewShift; + if (Trunc) { + Builder.Insert(NewShift); + Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType()); + } + + return Ret; +} + +// If we have some pattern that leaves only some low bits set and then performs +// a left-shift of those bits, and if none of the bits that are left after the +// final shift are modified by the mask, we can omit the mask.
+// +// There are many variants to this pattern: +// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt +// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt +// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt +// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt +// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt +// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt +// All these patterns can be simplified to just: +// x << ShiftShAmt +// iff: +// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x) +// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt) +static Instruction * +dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift, + const SimplifyQuery &Q, + InstCombiner::BuilderTy &Builder) { + assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl && + "The input must be 'shl'!"); + + Value *Masked, *ShiftShAmt; + match(OuterShift, + m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt)))); + + // *If* there is a truncation between an outer shift and a possible mask, + // then said truncation *must* be one-use, else we can't perform the fold. + Value *Trunc; + if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) && + !Trunc->hasOneUse()) + return nullptr; + + Type *NarrowestTy = OuterShift->getType(); + Type *WidestTy = Masked->getType(); + bool HadTrunc = WidestTy != NarrowestTy; + + // The mask must be computed in a type twice as wide to ensure + // that no bits are lost if the sum-of-shifts is wider than the base type. + Type *ExtendedTy = WidestTy->getExtendedType(); + + Value *MaskShAmt; + + // ((1 << MaskShAmt) - 1) + auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes()); + // (~(-1 << maskNbits)) + auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes()); + // (-1 >> MaskShAmt) + auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt)); + // ((-1 << MaskShAmt) >> MaskShAmt) + auto MaskD = + m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt)); + + Value *X; + Constant *NewMask; + + if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) { + // Peek through an optional zext of the shift amount. + match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt))); + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (MaskShAmt->getType() != ShiftShAmt->getType()) + return nullptr; + + // Can we simplify (MaskShAmt+ShiftShAmt) ? + auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst( + MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!SumOfShAmts) + return nullptr; // Did not simplify. + // In this pattern SumOfShAmts correlates with the number of low bits + // that shall remain in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the `undef` shift amounts with the final + // shift bitwidth to ensure that the value remains undef when creating the + // subsequent shift op.
+ SumOfShAmts = Constant::replaceUndefsWith( + SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(), + ExtendedTy->getScalarSizeInBits())); + auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy); + // And compute the mask as usual: ~(-1 << (SumOfShAmts)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + auto *ExtendedInvertedMask = + ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts); + NewMask = ConstantExpr::getNot(ExtendedInvertedMask); + } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) || + match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)), + m_Deferred(MaskShAmt)))) { + // Peek through an optional zext of the shift amount. + match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt))); + + // We have two shift amounts from two different shifts. The types of those + // shift amounts may not match. If that's the case, let's bail out now. + if (MaskShAmt->getType() != ShiftShAmt->getType()) + return nullptr; + + // Can we simplify (ShiftShAmt-MaskShAmt) ? + auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst( + ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q)); + if (!ShAmtsDiff) + return nullptr; // Did not simplify. + // In this pattern ShAmtsDiff correlates with the number of high bits that + // shall be unset in the root value (OuterShift). + + // An extend of an undef value becomes zero because the high bits are never + // completely unknown. Replace the `undef` shift amounts with the negated + // bitwidth of the innermost shift to ensure that the value remains undef + // when creating the subsequent shift op. + unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits(); + ShAmtsDiff = Constant::replaceUndefsWith( + ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(), + -WidestTyBitWidth)); + auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt( + ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(), + WidestTyBitWidth, + /*isSigned=*/false), + ShAmtsDiff), + ExtendedTy); + // And compute the mask as usual: (-1 l>> (NumHighBitsToClear)) + auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy); + NewMask = + ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear); + } else + return nullptr; // Don't know anything about this pattern. + + NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy); + + // Does this mask have any unset bits? If not, then we can just not apply it. + bool NeedMask = !match(NewMask, m_AllOnes()); + + // If we need to apply a mask, there are several more restrictions to satisfy. + if (NeedMask) { + // The old masking instruction must go away. + if (!Masked->hasOneUse()) + return nullptr; + // The original "masking" instruction must not have been `ashr`. + if (match(Masked, m_AShr(m_Value(), m_Value()))) + return nullptr; + } + + // If we need to apply truncation, let's do it first, since we can. + // We have already ensured that the old truncation will go away. + if (HadTrunc) + X = Builder.CreateTrunc(X, NarrowestTy); + + // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits. + // We didn't change the Type of this outermost shift, so we can just do it.
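Variant (a) of the pattern handled above, with illustrative shift amounts MaskShAmt = 24 and ShiftShAmt = 8 on a 32-bit value:

    unsigned mask_then_shift(unsigned x) {
      return (x & ((1u << 24) - 1)) << 8; // keep the low 24 bits, then shl 8
    }

Since 24 + 8 u>= 32, every bit cleared by the mask is shifted out anyway, so the whole expression simplifies to x << 8 and NeedMask ends up false.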
+ auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X, + OuterShift->getOperand(1)); + if (!NeedMask) + return NewShift; + + Builder.Insert(NewShift); + return BinaryOperator::Create(Instruction::And, NewShift, NewMask); +} + +/// If we have a shift-by-constant of a bitwise logic op that itself has a +/// shift-by-constant operand with identical opcode, we may be able to convert +/// that into 2 independent shifts followed by the logic op. This eliminates a +/// use of an intermediate value (and reduces the dependency chain). +static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I, + InstCombiner::BuilderTy &Builder) { + assert(I.isShift() && "Expected a shift as input"); + auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0)); + if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse()) + return nullptr; + + const APInt *C0, *C1; + if (!match(I.getOperand(1), m_APInt(C1))) + return nullptr; + + Instruction::BinaryOps ShiftOpcode = I.getOpcode(); + Type *Ty = I.getType(); + + // Find a matching one-use shift by constant. The fold is not valid if the sum + // of the shift values equals or exceeds the bitwidth. + // TODO: Remove the one-use check if the other logic operand (Y) is constant. + Value *X, *Y; + auto matchFirstShift = [&](Value *V) { + return !isa<ConstantExpr>(V) && + match(V, m_OneUse(m_Shift(m_Value(X), m_APInt(C0)))) && + cast<BinaryOperator>(V)->getOpcode() == ShiftOpcode && + (*C0 + *C1).ult(Ty->getScalarSizeInBits()); + }; + + // Logic ops are commutative, so check each operand for a match. + if (matchFirstShift(LogicInst->getOperand(0))) + Y = LogicInst->getOperand(1); + else if (matchFirstShift(LogicInst->getOperand(1))) + Y = LogicInst->getOperand(0); + else + return nullptr; + + // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1) + Constant *ShiftSumC = ConstantInt::get(Ty, *C0 + *C1); + Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC); + Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1)); + return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2); } Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); assert(Op0->getType() == Op1->getType()); + // If the shift amount is a one-use `sext`, we can demote it to `zext`. + Value *Y; + if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) { + Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName()); + return BinaryOperator::Create(I.getOpcode(), Op0, NewExt); + } + // See if we can fold away this shift.
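A C++ sketch of the foldShiftOfShiftedLogic rewrite above, legal here because C0 + C1 = 5 u< 32:

    unsigned shifted_logic(unsigned x, unsigned y) {
      return ((x << 3) ^ y) << 2;  // shift (logic (shift X, C0), Y), C1
    }
    // -> logic (shift X, C0+C1), (shift Y, C1):
    unsigned shifted_logic_folded(unsigned x, unsigned y) {
      return (x << 5) ^ (y << 2);
    }

The instruction count stays the same, but the two shifts in the folded form are independent, so the dependency chain through the intermediate shift is broken.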
if (SimplifyDemandedInstructionBits(I)) return &I; @@ -83,8 +365,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I)) return Res; - if (Instruction *NewShift = - reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)) + if (auto *NewShift = cast_or_null<Instruction>( + reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ))) return NewShift; // (C1 shift (A add C2)) -> (C1 shift C2) shift A) @@ -110,6 +392,9 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { return &I; } + if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder)) + return Logic; + return nullptr; } @@ -618,9 +903,10 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1, } Instruction *InstCombiner::visitShl(BinaryOperator &I) { + const SimplifyQuery Q = SQ.getWithInstruction(&I); + if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1), - I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), - SQ.getWithInstruction(&I))) + I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q)) return replaceInstUsesWith(I, V); if (Instruction *X = foldVectorBinop(I)) @@ -629,6 +915,9 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { if (Instruction *V = commonShiftTransforms(I)) return V; + if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder)) + return V; + Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); Type *Ty = I.getType(); unsigned BitWidth = Ty->getScalarSizeInBits(); @@ -636,12 +925,11 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { const APInt *ShAmtAPInt; if (match(Op1, m_APInt(ShAmtAPInt))) { unsigned ShAmt = ShAmtAPInt->getZExtValue(); - unsigned BitWidth = Ty->getScalarSizeInBits(); // shl (zext X), ShAmt --> zext (shl X, ShAmt) // This is only valid if X would have zeros shifted out. Value *X; - if (match(Op0, m_ZExt(m_Value(X)))) { + if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) { unsigned SrcWidth = X->getType()->getScalarSizeInBits(); if (ShAmt < SrcWidth && MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I)) @@ -719,6 +1007,12 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) { // (X * C2) << C1 --> X * (C2 << C1) if (match(Op0, m_Mul(m_Value(X), m_Constant(C2)))) return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1)); + + // shl (zext i1 X), C1 --> select (X, 1 << C1, 0) + if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) { + auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1); + return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty)); + } } // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1 @@ -859,6 +1153,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. 
+ auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after the outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be its bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then the outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass the two innermost shifts, and perform the outermost shift + // on the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -933,6 +1296,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr.
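The outer shape handled above, written as C++ for a 32-bit value (for 0 < n < 32, and assuming two's-complement wrap on the left shift and an arithmetic right shift, both of which hold for the IR pattern):

    int sext_low_nbits(int x, int n) {
      return (x << (32 - n)) >> (32 - n); // sign-extend the low n bits of x
    }

When x is itself a variable high-bit extract with the same n, i.e. x = src >> (32 - n), the shl/ashr pair is redundant: the fold bypasses it and performs the outer shift directly on the operands of the innermost shift, keeping a trunc if one was present.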
if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index e0d85c4b49ae..47ce83974c8d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -14,6 +14,8 @@ #include "InstCombineInternal.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -348,8 +350,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. - if (ShrinkDemandedConstant(I, 1, DemandedMask) || - ShrinkDemandedConstant(I, 2, DemandedMask)) + // This is similar to ShrinkDemandedConstant, but for a select we want to + // try to keep the selected constants the same as icmp value constants, if + // we can. This helps not break apart (or helps put back together) + // canonical patterns like min and max. + auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo, + APInt DemandedMask) { + const APInt *SelC; + if (!match(I->getOperand(OpNo), m_APInt(SelC))) + return false; + + // Get the constant out of the ICmp, if there is one. + const APInt *CmpC; + ICmpInst::Predicate Pred; + if (!match(I->getOperand(0), m_c_ICmp(Pred, m_APInt(CmpC), m_Value())) || + CmpC->getBitWidth() != SelC->getBitWidth()) + return ShrinkDemandedConstant(I, OpNo, DemandedMask); + + // If the constant is already the same as the ICmp, leave it as-is. + if (*CmpC == *SelC) + return false; + // If the constants are not already the same, but can be with the demand + // mask, use the constant value from the ICmp. + if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) { + I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC)); + return true; + } + return ShrinkDemandedConstant(I, OpNo, DemandedMask); + }; + if (CanonicalizeSelectConstant(I, 1, DemandedMask) || + CanonicalizeSelectConstant(I, 2, DemandedMask)) return I; // Only known if known in both the LHS and RHS. @@ -971,6 +1001,13 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1, Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, APInt DemandedElts, int DMaskIdx) { + + // FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported. + if (DMaskIdx < 0 && + II->getType()->getScalarSizeInBits() != 32 && + DemandedElts.getActiveBits() == 3) + return nullptr; + unsigned VWidth = II->getType()->getVectorNumElements(); if (VWidth == 1) return nullptr; @@ -1067,16 +1104,22 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II, } /// The specified value produces a vector with any number of elements. +/// This method analyzes which elements of the operand are undef and returns +/// that information in UndefElts. +/// /// DemandedElts contains the set of elements that are actually used by the -/// caller. This method analyzes which elements of the operand are undef and -/// returns that information in UndefElts. 
+/// caller, and by default (AllowMultipleUsers equals false) the value is +/// simplified only if it has a single caller. If AllowMultipleUsers is set +/// to true, DemandedElts refers to the union of sets of elements that are +/// used by all callers. /// /// If the information about demanded elements can be used to simplify the /// operation, the operation is simplified, then the resultant value is /// returned. This returns null if no change was made. Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts, - unsigned Depth) { + unsigned Depth, + bool AllowMultipleUsers) { unsigned VWidth = V->getType()->getVectorNumElements(); APInt EltMask(APInt::getAllOnesValue(VWidth)); assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!"); @@ -1130,19 +1173,21 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, if (Depth == 10) return nullptr; - // If multiple users are using the root value, proceed with - // simplification conservatively assuming that all elements - // are needed. - if (!V->hasOneUse()) { - // Quit if we find multiple users of a non-root value though. - // They'll be handled when it's their turn to be visited by - // the main instcombine process. - if (Depth != 0) - // TODO: Just compute the UndefElts information recursively. - return nullptr; + if (!AllowMultipleUsers) { + // If multiple users are using the root value, proceed with + // simplification conservatively assuming that all elements + // are needed. + if (!V->hasOneUse()) { + // Quit if we find multiple users of a non-root value though. + // They'll be handled when it's their turn to be visited by + // the main instcombine process. + if (Depth != 0) + // TODO: Just compute the UndefElts information recursively. + return nullptr; - // Conservatively assume that all elements are needed. - DemandedElts = EltMask; + // Conservatively assume that all elements are needed. + DemandedElts = EltMask; + } } Instruction *I = dyn_cast<Instruction>(V); @@ -1232,30 +1277,57 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, break; } case Instruction::ShuffleVector: { - ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I); - unsigned LHSVWidth = - Shuffle->getOperand(0)->getType()->getVectorNumElements(); - APInt LeftDemanded(LHSVWidth, 0), RightDemanded(LHSVWidth, 0); + auto *Shuffle = cast<ShuffleVectorInst>(I); + assert(Shuffle->getOperand(0)->getType() == + Shuffle->getOperand(1)->getType() && + "Expected shuffle operands to have same type"); + unsigned OpWidth = + Shuffle->getOperand(0)->getType()->getVectorNumElements(); + APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0); for (unsigned i = 0; i < VWidth; i++) { if (DemandedElts[i]) { unsigned MaskVal = Shuffle->getMaskValue(i); if (MaskVal != -1u) { - assert(MaskVal < LHSVWidth * 2 && + assert(MaskVal < OpWidth * 2 && "shufflevector mask index out of range!"); - if (MaskVal < LHSVWidth) + if (MaskVal < OpWidth) LeftDemanded.setBit(MaskVal); else - RightDemanded.setBit(MaskVal - LHSVWidth); + RightDemanded.setBit(MaskVal - OpWidth); } } } - APInt LHSUndefElts(LHSVWidth, 0); + APInt LHSUndefElts(OpWidth, 0); simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts); - APInt RHSUndefElts(LHSVWidth, 0); + APInt RHSUndefElts(OpWidth, 0); simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts); + // If this shuffle does not change the vector length and the elements + // demanded by this shuffle are an identity mask, then this shuffle is + // unnecessary. 
+ // + // We are assuming canonical form for the mask, so the source vector is + // operand 0 and operand 1 is not used. + // + // Note that if an element is demanded and this shuffle mask is undefined + // for that element, then the shuffle is not considered an identity + // operation. The shuffle prevents poison from the operand vector from + // leaking to the result by replacing poison with an undefined value. + if (VWidth == OpWidth) { + bool IsIdentityShuffle = true; + for (unsigned i = 0; i < VWidth; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (DemandedElts[i] && i != MaskVal) { + IsIdentityShuffle = false; + break; + } + } + if (IsIdentityShuffle) + return Shuffle->getOperand(0); + } + bool NewUndefElts = false; unsigned LHSIdx = -1u, LHSValIdx = -1u; unsigned RHSIdx = -1u, RHSValIdx = -1u; @@ -1268,23 +1340,23 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } else if (!DemandedElts[i]) { NewUndefElts = true; UndefElts.setBit(i); - } else if (MaskVal < LHSVWidth) { + } else if (MaskVal < OpWidth) { if (LHSUndefElts[MaskVal]) { NewUndefElts = true; UndefElts.setBit(i); } else { - LHSIdx = LHSIdx == -1u ? i : LHSVWidth; - LHSValIdx = LHSValIdx == -1u ? MaskVal : LHSVWidth; + LHSIdx = LHSIdx == -1u ? i : OpWidth; + LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth; LHSUniform = LHSUniform && (MaskVal == i); } } else { - if (RHSUndefElts[MaskVal - LHSVWidth]) { + if (RHSUndefElts[MaskVal - OpWidth]) { NewUndefElts = true; UndefElts.setBit(i); } else { - RHSIdx = RHSIdx == -1u ? i : LHSVWidth; - RHSValIdx = RHSValIdx == -1u ? MaskVal - LHSVWidth : LHSVWidth; - RHSUniform = RHSUniform && (MaskVal - LHSVWidth == i); + RHSIdx = RHSIdx == -1u ? i : OpWidth; + RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth; + RHSUniform = RHSUniform && (MaskVal - OpWidth == i); } } } @@ -1293,20 +1365,20 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // this constant vector to single insertelement instruction. // shufflevector V, C, <v1, v2, .., ci, .., vm> -> // insertelement V, C[ci], ci-n - if (LHSVWidth == Shuffle->getType()->getNumElements()) { + if (OpWidth == Shuffle->getType()->getNumElements()) { Value *Op = nullptr; Constant *Value = nullptr; unsigned Idx = -1u; // Find constant vector with the single element in shuffle (LHS or RHS). 
- if (LHSIdx < LHSVWidth && RHSUniform) { + if (LHSIdx < OpWidth && RHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) { Op = Shuffle->getOperand(1); Value = CV->getOperand(LHSValIdx); Idx = LHSIdx; } } - if (RHSIdx < LHSVWidth && LHSUniform) { + if (RHSIdx < OpWidth && LHSUniform) { if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) { Op = Shuffle->getOperand(0); Value = CV->getOperand(RHSValIdx); @@ -1674,8 +1746,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, case Intrinsic::amdgcn_buffer_load_format: case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: + case Intrinsic::amdgcn_struct_tbuffer_load: + case Intrinsic::amdgcn_tbuffer_load: return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts); default: { if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID())) diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index dc9abdd7f47a..f604c9dc32ca 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -253,6 +253,69 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext, return nullptr; } +/// Find elements of V demanded by UserInstr. +static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + // Conservatively assume that all elements are needed. + APInt UsedElts(APInt::getAllOnesValue(VWidth)); + + switch (UserInstr->getOpcode()) { + case Instruction::ExtractElement: { + ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr); + assert(EEI->getVectorOperand() == V); + ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand()); + if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) { + UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue()); + } + break; + } + case Instruction::ShuffleVector: { + ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr); + unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements(); + + UsedElts = APInt(VWidth, 0); + for (unsigned i = 0; i < MaskNumElts; i++) { + unsigned MaskVal = Shuffle->getMaskValue(i); + if (MaskVal == -1u || MaskVal >= 2 * VWidth) + continue; + if (Shuffle->getOperand(0) == V && (MaskVal < VWidth)) + UsedElts.setBit(MaskVal); + if (Shuffle->getOperand(1) == V && + ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth))) + UsedElts.setBit(MaskVal - VWidth); + } + break; + } + default: + break; + } + return UsedElts; +} + +/// Find union of elements of V demanded by all its users. +/// If it is known by querying findDemandedEltsBySingleUser that +/// no user demands an element of V, then the corresponding bit +/// remains unset in the returned value. 
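A bitmask sketch of the union that findDemandedEltsByAllUsers computes for, say, a <4 x float> source whose only users are two extractelements of lanes 0 and 2 (values illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t ByExtract0 = 0b0001;             // extractelement %v, i32 0
      uint8_t ByExtract2 = 0b0100;             // extractelement %v, i32 2
      uint8_t Union = ByExtract0 | ByExtract2; // lanes 1 and 3 are never read
      assert(Union == 0b0101);
      return 0;
    }

With that union, SimplifyDemandedVectorElts can be called with AllowMultipleUsers = true and may rewrite the source vector even though it has several users.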
+static APInt findDemandedEltsByAllUsers(Value *V) { + unsigned VWidth = V->getType()->getVectorNumElements(); + + APInt UnionUsedElts(VWidth, 0); + for (const Use &U : V->uses()) { + if (Instruction *I = dyn_cast<Instruction>(U.getUser())) { + UnionUsedElts |= findDemandedEltsBySingleUser(V, I); + } else { + UnionUsedElts = APInt::getAllOnesValue(VWidth); + break; + } + + if (UnionUsedElts.isAllOnesValue()) + break; + } + + return UnionUsedElts; +} + Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { Value *SrcVec = EI.getVectorOperand(); Value *Index = EI.getIndexOperand(); @@ -271,19 +334,35 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { return nullptr; // This instruction only demands the single element from the input vector. - // If the input vector has a single use, simplify it based on this use - // property. - if (SrcVec->hasOneUse() && NumElts != 1) { - APInt UndefElts(NumElts, 0); - APInt DemandedElts(NumElts, 0); - DemandedElts.setBit(IndexC->getZExtValue()); - if (Value *V = SimplifyDemandedVectorElts(SrcVec, DemandedElts, - UndefElts)) { - EI.setOperand(0, V); - return &EI; + if (NumElts != 1) { + // If the input vector has a single use, simplify it based on this use + // property. + if (SrcVec->hasOneUse()) { + APInt UndefElts(NumElts, 0); + APInt DemandedElts(NumElts, 0); + DemandedElts.setBit(IndexC->getZExtValue()); + if (Value *V = + SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) { + EI.setOperand(0, V); + return &EI; + } + } else { + // If the input vector has multiple uses, simplify it based on a union + // of all elements used. + APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec); + if (!DemandedElts.isAllOnesValue()) { + APInt UndefElts(NumElts, 0); + if (Value *V = SimplifyDemandedVectorElts( + SrcVec, DemandedElts, UndefElts, 0 /* Depth */, + true /* AllowMultipleUsers */)) { + if (V != SrcVec) { + SrcVec->replaceAllUsesWith(V); + return &EI; + } + } + } } } - if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian())) return I; @@ -766,6 +845,55 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) { return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask); } +/// Try to fold an extract+insert element into an existing identity shuffle by +/// changing the shuffle's mask to include the index of this insert element. +static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) { + // Check if the vector operand of this insert is an identity shuffle. + auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0)); + if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) || + !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding())) + return nullptr; + + // Check for a constant insertion index. + uint64_t IdxC; + if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC))) + return nullptr; + + // Check if this insert's scalar op is extracted from the identity shuffle's + // input vector. + Value *Scalar = InsElt.getOperand(1); + Value *X = Shuf->getOperand(0); + if (!match(Scalar, m_ExtractElement(m_Specific(X), m_SpecificInt(IdxC)))) + return nullptr; + + // Replace the shuffle mask element at the index of this extract+insert with + // that same index value. 
+ // For example: + // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask' + unsigned NumMaskElts = Shuf->getType()->getVectorNumElements(); + SmallVector<Constant *, 16> NewMaskVec(NumMaskElts); + Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext()); + Constant *NewMaskEltC = ConstantInt::get(I32Ty, IdxC); + Constant *OldMask = Shuf->getMask(); + for (unsigned i = 0; i != NumMaskElts; ++i) { + if (i != IdxC) { + // All mask elements besides the inserted element remain the same. + NewMaskVec[i] = OldMask->getAggregateElement(i); + } else if (OldMask->getAggregateElement(i) == NewMaskEltC) { + // If the mask element was already set, there's nothing to do + // (demanded elements analysis may unset it later). + return nullptr; + } else { + assert(isa<UndefValue>(OldMask->getAggregateElement(i)) && + "Unexpected shuffle mask element for identity shuffle"); + NewMaskVec[i] = NewMaskEltC; + } + } + + Constant *NewMask = ConstantVector::get(NewMaskVec); + return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask); +} + /// If we have an insertelement instruction feeding into another insertelement /// and the 2nd is inserting a constant into the vector, canonicalize that /// constant insertion before the insertion of a variable: @@ -987,6 +1115,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) { if (Instruction *Splat = foldInsEltIntoSplat(IE)) return Splat; + if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE)) + return IdentityShuf; + return nullptr; } @@ -1009,17 +1140,23 @@ static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, if (Depth == 0) return false; switch (I->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + // Propagating an undefined shuffle mask element to integer div/rem is not + // allowed because those opcodes can create immediate undefined behavior + // from an undefined element in an operand. + if (llvm::any_of(Mask, [](int M){ return M == -1; })) + return false; + LLVM_FALLTHROUGH; case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: @@ -1040,9 +1177,7 @@ static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask, case Instruction::FPExt: case Instruction::GetElementPtr: { // Bail out if we would create longer vector ops. We could allow creating - // longer vector ops, but that may result in more expensive codegen. We - // would also need to limit the transform to avoid undefined behavior for - // integer div/rem. + // longer vector ops, but that may result in more expensive codegen. Type *ITy = I->getType(); if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements()) return false; @@ -1255,20 +1390,6 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) { llvm_unreachable("failed to reorder elements of vector instruction!"); } -static void recognizeIdentityMask(const SmallVectorImpl<int> &Mask, - bool &isLHSID, bool &isRHSID) { - isLHSID = isRHSID = true; - - for (unsigned i = 0, e = Mask.size(); i != e; ++i) { - if (Mask[i] < 0) continue; // Ignore undef values. - // Is this an identity shuffle of the LHS value? - isLHSID &= (Mask[i] == (int)i); - - // Is this an identity shuffle of the RHS value? 
- isRHSID &= (Mask[i]-e == i); - } -} - // Returns true if the shuffle is extracting a contiguous range of values from // LHS, for example: // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+ @@ -1425,9 +1546,11 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf, if (!Shuf.isSelect()) return nullptr; - // Canonicalize to choose from operand 0 first. + // Canonicalize to choose from operand 0 first unless operand 1 is undefined. + // Commuting undef to operand 0 conflicts with another canonicalization. unsigned NumElts = Shuf.getType()->getVectorNumElements(); - if (Shuf.getMaskValue(0) >= (int)NumElts) { + if (!isa<UndefValue>(Shuf.getOperand(1)) && + Shuf.getMaskValue(0) >= (int)NumElts) { // TODO: Can we assert that both operands of a shuffle-select are not undef // (otherwise, it would have been folded by instsimplify? Shuf.commute(); @@ -1618,7 +1741,8 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) { return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask)); } -/// Try to replace a shuffle with an insertelement. +/// Try to replace a shuffle with an insertelement or try to replace a shuffle +/// operand with the operand of an insertelement. static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1); SmallVector<int, 16> Mask = Shuf.getShuffleMask(); @@ -1630,6 +1754,31 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) { if (NumElts != (int)(V0->getType()->getVectorNumElements())) return nullptr; + // This is a specialization of a fold in SimplifyDemandedVectorElts. We may + // not be able to handle it there if the insertelement has >1 use. + // If the shuffle has an insertelement operand but does not choose the + // inserted scalar element from that value, then we can replace that shuffle + // operand with the source vector of the insertelement. + Value *X; + uint64_t IdxC; + if (match(V0, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { + // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask + if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) { + Shuf.setOperand(0, X); + return &Shuf; + } + } + if (match(V1, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) { + // Offset the index constant by the vector width because we are checking for + // accesses to the 2nd vector input of the shuffle. + IdxC += NumElts; + // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask + if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) { + Shuf.setOperand(1, X); + return &Shuf; + } + } + // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC' auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) { // We need an insertelement with a constant index. @@ -1756,29 +1905,21 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI))) return replaceInstUsesWith(SVI, V); - // Canonicalize shuffle(x ,x,mask) -> shuffle(x, undef,mask') - // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask'). 
+ // shuffle x, x, mask --> shuffle x, undef, mask' unsigned VWidth = SVI.getType()->getVectorNumElements(); unsigned LHSWidth = LHS->getType()->getVectorNumElements(); SmallVector<int, 16> Mask = SVI.getShuffleMask(); Type *Int32Ty = Type::getInt32Ty(SVI.getContext()); - if (LHS == RHS || isa<UndefValue>(LHS)) { + if (LHS == RHS) { + assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?"); // Remap any references to RHS to use LHS. SmallVector<Constant*, 16> Elts; - for (unsigned i = 0, e = LHSWidth; i != VWidth; ++i) { - if (Mask[i] < 0) { - Elts.push_back(UndefValue::get(Int32Ty)); - continue; - } - - if ((Mask[i] >= (int)e && isa<UndefValue>(RHS)) || - (Mask[i] < (int)e && isa<UndefValue>(LHS))) { - Mask[i] = -1; // Turn into undef. + for (unsigned i = 0; i != VWidth; ++i) { + // Propagate undef elements or force mask to LHS. + if (Mask[i] < 0) Elts.push_back(UndefValue::get(Int32Ty)); - } else { - Mask[i] = Mask[i] % e; // Force to LHS. - Elts.push_back(ConstantInt::get(Int32Ty, Mask[i])); - } + else + Elts.push_back(ConstantInt::get(Int32Ty, Mask[i] % LHSWidth)); } SVI.setOperand(0, SVI.getOperand(1)); SVI.setOperand(1, UndefValue::get(RHS->getType())); @@ -1786,6 +1927,12 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return &SVI; } + // shuffle undef, x, mask --> shuffle x, undef, mask' + if (isa<UndefValue>(LHS)) { + SVI.commute(); + return &SVI; + } + if (Instruction *I = canonicalizeInsertSplat(SVI, Builder)) return I; @@ -1813,16 +1960,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (Instruction *I = foldIdentityPaddedShuffles(SVI)) return I; - if (VWidth == LHSWidth) { - // Analyze the shuffle, are the LHS or RHS and identity shuffles? - bool isLHSID, isRHSID; - recognizeIdentityMask(Mask, isLHSID, isRHSID); - - // Eliminate identity shuffles. - if (isLHSID) return replaceInstUsesWith(SVI, LHS); - if (isRHSID) return replaceInstUsesWith(SVI, RHS); - } - if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) { Value *V = evaluateInDifferentElementOrder(LHS, Mask); return replaceInstUsesWith(SVI, V); @@ -2100,12 +2237,5 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) { return new ShuffleVectorInst(newLHS, newRHS, ConstantVector::get(Elts)); } - // If the result mask is an identity, replace uses of this instruction with - // corresponding argument. - bool isLHSID, isRHSID; - recognizeIdentityMask(newMask, isLHSID, isRHSID); - if (isLHSID && VWidth == LHSOp0Width) return replaceInstUsesWith(SVI, newLHS); - if (isRHSID && VWidth == RHSOp0Width) return replaceInstUsesWith(SVI, newRHS); - return MadeChange ? 
&SVI : nullptr; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 385f4926b845..bf32996d96e2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -86,6 +86,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" @@ -121,6 +122,9 @@ STATISTIC(NumReassoc , "Number of reassociations"); DEBUG_COUNTER(VisitCounter, "instcombine-visit", "Controls which instructions are visited"); +static constexpr unsigned InstCombineDefaultMaxIterations = 1000; +static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000; + static cl::opt<bool> EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); @@ -129,6 +133,17 @@ static cl::opt<bool> EnableExpensiveCombines("expensive-combines", cl::desc("Enable expensive instruction combines")); +static cl::opt<unsigned> LimitMaxIterations( + "instcombine-max-iterations", + cl::desc("Limit the maximum number of instruction combining iterations"), + cl::init(InstCombineDefaultMaxIterations)); + +static cl::opt<unsigned> InfiniteLoopDetectionThreshold( + "instcombine-infinite-loop-threshold", + cl::desc("Number of instruction combining iterations considered an " + "infinite loop"), + cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden); + static cl::opt<unsigned> MaxArraySize("instcombine-maxarray-size", cl::init(1024), cl::desc("Maximum array size considered when doing a combine")); @@ -200,8 +215,8 @@ bool InstCombiner::shouldChangeType(Type *From, Type *To) const { // where both B and C should be ConstantInts, results in a constant that does // not overflow. This function only handles the Add and Sub opcodes. For // all other opcodes, the function conservatively returns false. -static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { - OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I); +static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); if (!OBO || !OBO->hasNoSignedWrap()) return false; @@ -224,10 +239,15 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { } static bool hasNoUnsignedWrap(BinaryOperator &I) { - OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); return OBO && OBO->hasNoUnsignedWrap(); } +static bool hasNoSignedWrap(BinaryOperator &I) { + auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I); + return OBO && OBO->hasNoSignedWrap(); +} + /// Conservatively clears subclassOptionalData after a reassociation or /// commutation. We preserve fast-math flags when applicable as they can be /// preserved. @@ -332,22 +352,21 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // It simplifies to V. Form "A op V". I.setOperand(0, A); I.setOperand(1, V); - // Conservatively clear the optional flags, since they may not be - // preserved by the reassociation. 
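      // For example, reassociating "(X +nsw 1) +nsw 2" into "X +nsw 3" may keep
      // nsw only because 1 + 2 does not itself overflow and the inner add
      // already carried nsw; everything else is cleared conservatively below.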
bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0); - bool IsNSW = MaintainNoSignedWrap(I, B, C); + bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0); + // Conservatively clear all optional flags since they may not be + // preserved by the reassociation. Reset nsw/nuw based on the above + // analysis. ClearSubclassDataAfterReassociation(I); + // Note: this is only valid because SimplifyBinOp doesn't look at + // the operands to Op0. if (IsNUW) I.setHasNoUnsignedWrap(true); - if (IsNSW && - (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { - // Note: this is only valid because SimplifyBinOp doesn't look at - // the operands to Op0. + if (IsNSW) I.setHasNoSignedWrap(true); - } Changed = true; ++NumReassoc; @@ -610,7 +629,6 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, HasNUW &= ROBO->hasNoUnsignedWrap(); } - const APInt *CInt; if (TopLevelOpcode == Instruction::Add && InnerOpcode == Instruction::Mul) { // We can propagate 'nsw' if we know that @@ -620,6 +638,7 @@ Value *InstCombiner::tryFactorization(BinaryOperator &I, // %Z = mul nsw i16 %X, C+1 // // iff C+1 isn't INT_MIN + const APInt *CInt; if (match(V, m_APInt(CInt))) { if (!CInt->isMinSignedValue()) BO->setHasNoSignedWrap(HasNSW); @@ -755,31 +774,52 @@ Value *InstCombiner::SimplifyUsingDistributiveLaws(BinaryOperator &I) { Value *InstCombiner::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, Value *RHS) { + Value *A, *B, *C, *D, *E, *F; + bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))); + bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F))); + if (!LHSIsSelect && !RHSIsSelect) + return nullptr; + + FastMathFlags FMF; + BuilderTy::FastMathFlagGuard Guard(Builder); + if (isa<FPMathOperator>(&I)) { + FMF = I.getFastMathFlags(); + Builder.setFastMathFlags(FMF); + } + Instruction::BinaryOps Opcode = I.getOpcode(); - // (op (select (a, b, c)), (select (a, d, e))) -> (select (a, (op b, d), (op - // c, e))) - Value *A, *B, *C, *D, *E; - Value *SI = nullptr; - if (match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C))) && - match(RHS, m_Select(m_Specific(A), m_Value(D), m_Value(E)))) { - bool SelectsHaveOneUse = LHS->hasOneUse() && RHS->hasOneUse(); - BuilderTy::FastMathFlagGuard Guard(Builder); - if (isa<FPMathOperator>(&I)) - Builder.setFastMathFlags(I.getFastMathFlags()); - - Value *V1 = SimplifyBinOp(Opcode, C, E, SQ.getWithInstruction(&I)); - Value *V2 = SimplifyBinOp(Opcode, B, D, SQ.getWithInstruction(&I)); - if (V1 && V2) - SI = Builder.CreateSelect(A, V2, V1); - else if (V2 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, V2, Builder.CreateBinOp(Opcode, C, E)); - else if (V1 && SelectsHaveOneUse) - SI = Builder.CreateSelect(A, Builder.CreateBinOp(Opcode, B, D), V1); - - if (SI) - SI->takeName(&I); + SimplifyQuery Q = SQ.getWithInstruction(&I); + + Value *Cond, *True = nullptr, *False = nullptr; + if (LHSIsSelect && RHSIsSelect && A == D) { + // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F) + Cond = A; + True = SimplifyBinOp(Opcode, B, E, FMF, Q); + False = SimplifyBinOp(Opcode, C, F, FMF, Q); + + if (LHS->hasOneUse() && RHS->hasOneUse()) { + if (False && !True) + True = Builder.CreateBinOp(Opcode, B, E); + else if (True && !False) + False = Builder.CreateBinOp(Opcode, C, F); + } + } else if (LHSIsSelect && LHS->hasOneUse()) { + // (A ? B : C) op Y -> A ? 
(B op Y) : (C op Y) + Cond = A; + True = SimplifyBinOp(Opcode, B, RHS, FMF, Q); + False = SimplifyBinOp(Opcode, C, RHS, FMF, Q); + } else if (RHSIsSelect && RHS->hasOneUse()) { + // X op (D ? E : F) -> D ? (X op E) : (X op F) + Cond = D; + True = SimplifyBinOp(Opcode, LHS, E, FMF, Q); + False = SimplifyBinOp(Opcode, LHS, F, FMF, Q); } + if (!True || !False) + return nullptr; + + Value *SI = Builder.CreateSelect(Cond, True, False); + SI->takeName(&I); return SI; } @@ -1518,11 +1558,13 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) { // If this is a widening shuffle, we must be able to extend with undef // elements. If the original binop does not produce an undef in the high // lanes, then this transform is not safe. + // Similarly for undef lanes due to the shuffle mask, we can only + // transform binops that preserve undef. // TODO: We could shuffle those non-undef constant values into the // result by using a constant vector (rather than an undef vector) // as operand 1 of the new binop, but that might be too aggressive // for target-independent shuffle creation. - if (I >= SrcVecNumElts) { + if (I >= SrcVecNumElts || ShMask[I] < 0) { Constant *MaybeUndef = ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt) : ConstantExpr::get(Opcode, CElt, UndefScalar); @@ -1607,6 +1649,15 @@ Instruction *InstCombiner::narrowMathIfNoOverflow(BinaryOperator &BO) { return CastInst::Create(CastOpc, NarrowBO, BO.getType()); } +static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) { + // At least one GEP must be inbounds. + if (!GEP1.isInBounds() && !GEP2.isInBounds()) + return false; + + return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) && + (GEP2.isInBounds() || GEP2.hasAllZeroIndices()); +} + Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end()); Type *GEPType = GEP.getType(); @@ -1659,7 +1710,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // to an index of zero, so replace it with zero if it is not zero already. Type *EltTy = GTI.getIndexedType(); if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0) - if (!isa<Constant>(*I) || !cast<Constant>(*I)->isNullValue()) { + if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) { *I = Constant::getNullValue(NewIndexType); MadeChange = true; } @@ -1716,8 +1767,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // The first two arguments can vary for any GEP, the rest have to be // static for struct slots - if (J > 1 && CurTy->isStructTy()) - return nullptr; + if (J > 1) { + assert(CurTy && "No current type?"); + if (CurTy->isStructTy()) + return nullptr; + } DI = J; } else { @@ -1877,6 +1931,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Update the GEP in place if possible. if (Src->getNumOperands() == 2) { + GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))); GEP.setOperand(0, Src->getOperand(0)); GEP.setOperand(1, Sum); return &GEP; @@ -1893,7 +1948,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } if (!Indices.empty()) - return GEP.isInBounds() && Src->isInBounds() + return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)) ? 
GetElementPtrInst::CreateInBounds( Src->getSourceElementType(), Src->getOperand(0), Indices, GEP.getName()) @@ -2146,15 +2201,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // of a bitcasted pointer to vector or array of the same dimensions: // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z - auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy) { + auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy, + const DataLayout &DL) { return ArrTy->getArrayElementType() == VecTy->getVectorElementType() && - ArrTy->getArrayNumElements() == VecTy->getVectorNumElements(); + ArrTy->getArrayNumElements() == VecTy->getVectorNumElements() && + DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy); }; if (GEP.getNumOperands() == 3 && ((GEPEltType->isArrayTy() && SrcEltType->isVectorTy() && - areMatchingArrayAndVecTypes(GEPEltType, SrcEltType)) || + areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) || (GEPEltType->isVectorTy() && SrcEltType->isArrayTy() && - areMatchingArrayAndVecTypes(SrcEltType, GEPEltType)))) { + areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) { // Create a new GEP here, as using `setOperand()` followed by // `setSourceElementType()` won't actually update the type of the @@ -2393,12 +2450,13 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) || - isa<AddrSpaceCastInst>(I)) { - replaceInstUsesWith(*I, UndefValue::get(I->getType())); } else if (auto *SI = dyn_cast<StoreInst>(I)) { for (auto *DII : DIIs) ConvertDebugDeclareToDebugValue(DII, SI, *DIB); + } else { + // Casts, GEP, or anything else: we're about to delete this instruction, + // so it can not have any valid uses. + replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); } @@ -2549,9 +2607,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Change br (not X), label True, label False to: br X, label False, True Value *X = nullptr; - BasicBlock *TrueDest; - BasicBlock *FalseDest; - if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) && + if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) && !isa<Constant>(X)) { // Swap Destinations and condition... BI.setCondition(X); @@ -2569,8 +2625,8 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq. CmpInst::Predicate Pred; - if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), TrueDest, - FalseDest)) && + if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())), + m_BasicBlock(), m_BasicBlock())) && !isCanonicalPredicate(Pred)) { // Swap destinations and condition. 
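    // For instance, a one-use compare with a non-canonical predicate such as
    //   br (icmp ne %a, %b), label %T, label %F
    // becomes
    //   br (icmp eq %a, %b), label %F, label %T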
CmpInst *Cond = cast<CmpInst>(BI.getCondition()); @@ -3105,6 +3161,15 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) { return nullptr; } +Instruction *InstCombiner::visitFreeze(FreezeInst &I) { + Value *Op0 = I.getOperand(0); + + if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I))) + return replaceInstUsesWith(I, V); + + return nullptr; +} + /// Try to move the specified instruction from its current block into the /// beginning of DestBlock, which can only happen if it's safe to move the /// instruction past all of the instructions between it and the end of its @@ -3156,6 +3221,21 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { findDbgUsers(DbgUsers, I); for (auto *DII : reverse(DbgUsers)) { if (DII->getParent() == SrcBlock) { + if (isa<DbgDeclareInst>(DII)) { + // A dbg.declare instruction should not be cloned, since there can only be + // one per variable fragment. It should be left in the original place since + // sunk instruction is not an alloca(otherwise we could not be here). + // But we need to update arguments of dbg.declare instruction, so that it + // would not point into sunk instruction. + if (!isa<CastInst>(I)) + continue; // dbg.declare points at something it shouldn't + + DII->setOperand( + 0, MetadataAsValue::get(I->getContext(), + ValueAsMetadata::get(I->getOperand(0)))); + continue; + } + // dbg.value is in the same basic block as the sunk inst, see if we can // salvage it. Clone a new copy of the instruction: on success we need // both salvaged and unsalvaged copies. @@ -3301,10 +3381,6 @@ bool InstCombiner::run() { // Move the name to the new instruction first. Result->takeName(I); - // Push the new instruction and any users onto the worklist. - Worklist.AddUsersToWorkList(*Result); - Worklist.Add(Result); - // Insert the new instruction into the basic block... BasicBlock *InstParent = I->getParent(); BasicBlock::iterator InsertPos = I->getIterator(); @@ -3316,6 +3392,10 @@ bool InstCombiner::run() { InstParent->getInstList().insert(InsertPos, Result); + // Push the new instruction and any users onto the worklist. + Worklist.AddUsersToWorkList(*Result); + Worklist.Add(Result); + eraseInstFromFunction(*I); } else { LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n' @@ -3371,8 +3451,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL, if (isInstructionTriviallyDead(Inst, TLI)) { ++NumDeadInst; LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n'); - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + salvageDebugInfoOrMarkUndef(*Inst); Inst->eraseFromParent(); MadeIRChange = true; continue; @@ -3486,10 +3565,12 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool ExpensiveCombines = true, - LoopInfo *LI = nullptr) { + ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations, + LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); - ExpensiveCombines |= EnableExpensiveCombines; + if (EnableExpensiveCombines.getNumOccurrences()) + ExpensiveCombines = EnableExpensiveCombines; + MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
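The new iteration cap can also be requested programmatically through the two-argument overload added later in this patch. Below is a minimal sketch using the legacy pass manager; the include paths and the surrounding driver function are assumptions for illustration, not part of the change itself.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/InstCombine/InstCombine.h"

    // Hypothetical driver: run InstCombine but stop after at most two
    // iterations instead of waiting for a fixpoint.
    static void runBoundedInstCombine(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createInstructionCombiningPass(/*ExpensiveCombines=*/true,
                                                  /*MaxIterations=*/2));
      PM.run(M);
    }

From the command line, the same limit can be imposed with the -instcombine-max-iterations flag defined above.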
@@ -3508,9 +3589,23 @@ static bool combineInstructionsOverFunction( MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. - int Iteration = 0; + unsigned Iteration = 0; while (true) { ++Iteration; + + if (Iteration > InfiniteLoopDetectionThreshold) { + report_fatal_error( + "Instruction Combining seems stuck in an infinite loop after " + + Twine(InfiniteLoopDetectionThreshold) + " iterations."); + } + + if (Iteration > MaxIterations) { + LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations + << " on " << F.getName() + << " reached; stopping before reaching a fixpoint\n"); + break; + } + LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -3522,11 +3617,19 @@ static bool combineInstructionsOverFunction( if (!IC.run()) break; + + MadeIRChange = true; } - return MadeIRChange || Iteration > 1; + return MadeIRChange; } +InstCombinePass::InstCombinePass(bool ExpensiveCombines) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {} + +InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations) + : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {} + PreservedAnalyses InstCombinePass::run(Function &F, FunctionAnalysisManager &AM) { auto &AC = AM.getResult<AssumptionAnalysis>(F); @@ -3544,8 +3647,9 @@ PreservedAnalyses InstCombinePass::run(Function &F, auto *BFI = (PSI && PSI->hasProfileSummary()) ? &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr; - if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI)) + if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI)) // No changes, all analyses are preserved. return PreservedAnalyses::all(); @@ -3580,7 +3684,7 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { // Required analyses. 
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); @@ -3594,12 +3698,26 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() : nullptr; - return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, - BFI, PSI, ExpensiveCombines, LI); + return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, + PSI, ExpensiveCombines, MaxIterations, + LI); } char InstructionCombiningPass::ID = 0; +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(InstCombineDefaultMaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + +InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) + : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), + MaxIterations(MaxIterations) { + initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine", "Combine redundant instructions", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) @@ -3626,6 +3744,11 @@ FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { return new InstructionCombiningPass(ExpensiveCombines); } +FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, + unsigned MaxIterations) { + return new InstructionCombiningPass(ExpensiveCombines, MaxIterations); +} + void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createInstructionCombiningPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 89a90eab8978..79c119489a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -129,6 +130,8 @@ static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; static const char *const kAsanModuleCtorName = "asan.module_ctor"; static const char *const kAsanModuleDtorName = "asan.module_dtor"; static const uint64_t kAsanCtorAndDtorPriority = 1; +// On Emscripten, the system needs more than one priorities for constructors. 
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50; static const char *const kAsanReportErrorTemplate = "__asan_report_"; static const char *const kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *const kAsanUnregisterGlobalsName = @@ -191,6 +194,11 @@ static cl::opt<bool> ClRecover( cl::desc("Enable recovery mode (continue-after-error)."), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClInsertVersionCheck( + "asan-guard-against-version-mismatch", + cl::desc("Guard against compiler/runtime version mismatch."), + cl::Hidden, cl::init(true)); + // This flag may need to be replaced with -f[no-]asan-reads. static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), @@ -530,6 +538,14 @@ static size_t RedzoneSizeForScale(int MappingScale) { return std::max(32U, 1U << MappingScale); } +static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) { + if (TargetTriple.isOSEmscripten()) { + return kAsanEmscriptenCtorAndDtorPriority; + } else { + return kAsanCtorAndDtorPriority; + } +} + namespace { /// Module analysis for getting various metadata about the module. @@ -565,10 +581,10 @@ char ASanGlobalsMetadataWrapperPass::ID = 0; /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer { - AddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseAfterScope = false) - : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(GlobalsMD) { + : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel; @@ -706,8 +722,8 @@ public: GlobalsMetadata &GlobalsMD = getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - AddressSanitizer ASan(*F.getParent(), GlobalsMD, CompileKernel, Recover, + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover, UseAfterScope); return ASan.instrumentFunction(F, TLI); } @@ -720,10 +736,10 @@ private: class ModuleAddressSanitizer { public: - ModuleAddressSanitizer(Module &M, GlobalsMetadata &GlobalsMD, + ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD, bool CompileKernel = false, bool Recover = false, bool UseGlobalsGC = true, bool UseOdrIndicator = false) - : GlobalsMD(GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + : GlobalsMD(*GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), // Enable aliases as they should have no downside with ODR indicators. UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias), UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator), @@ -830,7 +846,7 @@ public: bool runOnModule(Module &M) override { GlobalsMetadata &GlobalsMD = getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD(); - ModuleAddressSanitizer ASanModule(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); return ASanModule.instrumentModule(M); } @@ -1033,7 +1049,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> { if (!II.isLifetimeStartOrEnd()) return; // Found lifetime intrinsic, add ASan instrumentation if necessary. 
- ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0)); + auto *Size = cast<ConstantInt>(II.getArgOperand(0)); // If size argument is undefined, don't do anything. if (Size->isMinusOne()) return; // Check that size doesn't saturate uint64_t and can @@ -1156,7 +1172,7 @@ PreservedAnalyses AddressSanitizerPass::run(Function &F, Module &M = *F.getParent(); if (auto *R = MAM.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) { const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F); - AddressSanitizer Sanitizer(M, *R, CompileKernel, Recover, UseAfterScope); + AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope); if (Sanitizer.instrumentFunction(F, TLI)) return PreservedAnalyses::none(); return PreservedAnalyses::all(); @@ -1178,7 +1194,7 @@ ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel, PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M, AnalysisManager<Module> &AM) { GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M); - ModuleAddressSanitizer Sanitizer(M, GlobalsMD, CompileKernel, Recover, + ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover, UseGlobalGC, UseOdrIndicator); if (Sanitizer.instrumentModule(M)) return PreservedAnalyses::none(); @@ -1331,7 +1347,7 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -1775,9 +1791,10 @@ void ModuleAddressSanitizer::createInitializerPoisonCalls( // Must have a function or null ptr. if (Function *F = dyn_cast<Function>(CS->getOperand(1))) { if (F->getName() == kAsanModuleCtorName) continue; - ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0)); + auto *Priority = cast<ConstantInt>(CS->getOperand(0)); // Don't instrument CTORs that will run before asan.module_ctor. - if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue; + if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple)) + continue; poisonOneInitializer(*F, ModuleName); } } @@ -1792,6 +1809,8 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) { if (GlobalsMD.get(G).IsBlacklisted) return false; if (!Ty->isSized()) return false; if (!G->hasInitializer()) return false; + // Only instrument globals of default address spaces + if (G->getAddressSpace()) return false; if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals. // Two problems with thread-locals: // - The address of the main thread's copy can't be computed at link-time. 
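As a small illustration of the default-address-space check added above: a global living in a non-default address space, for example @g = addrspace(3) global i32 0, now makes ShouldInstrumentGlobal() return false, so it receives neither redzones nor an entry in the instrumented-globals metadata.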
@@ -1919,7 +1938,12 @@ StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const { case Triple::COFF: return ".ASAN$GL"; case Triple::ELF: return "asan_globals"; case Triple::MachO: return "__DATA,__asan_globals,regular"; - default: break; + case Triple::Wasm: + case Triple::XCOFF: + report_fatal_error( + "ModuleAddressSanitizer not implemented for object file format."); + case Triple::UnknownObjectFormat: + break; } llvm_unreachable("unsupported object format"); } @@ -2033,7 +2057,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF( unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType()); assert(isPowerOf2_32(SizeOfGlobalStruct) && "global metadata will not be padded appropriately"); - Metadata->setAlignment(SizeOfGlobalStruct); + Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct)); SetComdatForGlobalMetadata(G, Metadata, ""); } @@ -2170,7 +2194,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray( M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage, ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), ""); if (Mapping.Scale > 3) - AllGlobals->setAlignment(1ULL << Mapping.Scale); + AllGlobals->setAlignment(Align(1ULL << Mapping.Scale)); IRB.CreateCall(AsanRegisterGlobals, {IRB.CreatePointerCast(AllGlobals, IntptrTy), @@ -2270,7 +2294,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setComdat(G->getComdat()); - NewGlobal->setAlignment(MinRZ); + NewGlobal->setAlignment(MaybeAlign(MinRZ)); // Don't fold globals with redzones. ODR violation detector and redzone // poisoning implicitly creates a dependence on the global's address, so it // is no longer valid for it to be marked unnamed_addr. @@ -2338,7 +2362,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M, // Set meaningful attributes for indicator symbol. ODRIndicatorSym->setVisibility(NewGlobal->getVisibility()); ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass()); - ODRIndicatorSym->setAlignment(1); + ODRIndicatorSym->setAlignment(Align::None()); ODRIndicator = ODRIndicatorSym; } @@ -2410,39 +2434,39 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) { // Create a module constructor. A destructor is created lazily because not all // platforms, and not all modules need it. + std::string AsanVersion = std::to_string(GetAsanVersion(M)); std::string VersionCheckName = - kAsanVersionCheckNamePrefix + std::to_string(GetAsanVersion(M)); + ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : ""; std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions( M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{}, /*InitArgs=*/{}, VersionCheckName); bool CtorComdat = true; - bool Changed = false; // TODO(glider): temporarily disabled globals instrumentation for KASan. if (ClGlobals) { IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator()); - Changed |= InstrumentGlobals(IRB, M, &CtorComdat); + InstrumentGlobals(IRB, M, &CtorComdat); } + const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple); + // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, - AsanCtorFunction); + appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction); if (AsanDtorFunction) { AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName)); - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority, - AsanDtorFunction); + appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction); } } else { - appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalCtors(M, AsanCtorFunction, Priority); if (AsanDtorFunction) - appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority); + appendToGlobalDtors(M, AsanDtorFunction, Priority); } - return Changed; + return true; } void AddressSanitizer::initializeCallbacks(Module &M) { @@ -2664,7 +2688,7 @@ bool AddressSanitizer::instrumentFunction(Function &F, if (CS) { // A call inside BB. TempsToInstrument.clear(); - if (CS.doesNotReturn() && !CS->getMetadata("nosanitize")) + if (CS.doesNotReturn() && !CS->hasMetadata("nosanitize")) NoReturnCalls.push_back(CS.getInstruction()); } if (CallInst *CI = dyn_cast<CallInst>(&Inst)) @@ -2877,18 +2901,18 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() { for (Argument &Arg : F.args()) { if (Arg.hasByValAttr()) { Type *Ty = Arg.getType()->getPointerElementType(); - unsigned Align = Arg.getParamAlignment(); - if (Align == 0) Align = DL.getABITypeAlignment(Ty); + const Align Alignment = + DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty); AllocaInst *AI = IRB.CreateAlloca( Ty, nullptr, (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) + ".byval"); - AI->setAlignment(Align); + AI->setAlignment(Alignment); Arg.replaceAllUsesWith(AI); uint64_t AllocSize = DL.getTypeAllocSize(Ty); - IRB.CreateMemCpy(AI, Align, &Arg, Align, AllocSize); + IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize); } } } @@ -2919,7 +2943,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout( } assert((ClRealignStack & (ClRealignStack - 1)) == 0); size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack); - Alloca->setAlignment(FrameAlignment); + Alloca->setAlignment(MaybeAlign(FrameAlignment)); return IRB.CreatePointerCast(Alloca, IntptrTy); } @@ -2928,7 +2952,7 @@ void FunctionStackPoisoner::createDynamicAllocasInitStorage() { IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin())); DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr); IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout); - DynamicAllocaLayout->setAlignment(32); + DynamicAllocaLayout->setAlignment(Align(32)); } void FunctionStackPoisoner::processDynamicAllocas() { @@ -2971,7 +2995,6 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); // Make sure non-instrumented allocas stay in the entry block. 
Otherwise, // debug info is broken, because only entry-block allocas are treated as @@ -3066,14 +3089,12 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *Term = SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false); IRBuilder<> IRBIf(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); StackMallocIdx = StackMallocSizeClass(LocalStackSize); assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); Value *FakeStackValue = IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term, ConstantInt::get(IntptrTy, 0)); @@ -3081,14 +3102,11 @@ void FunctionStackPoisoner::processStaticAllocas() { IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); IRBIf.SetInsertPoint(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); Value *AllocaValue = DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); - IRB.SetCurrentDebugLocation(EntryDebugLocation); IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca); DIExprFlags |= DIExpression::DerefBefore; } else { @@ -3275,7 +3293,7 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) { // Insert new alloca with new NewSize and Align params. AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize); - NewAlloca->setAlignment(Align); + NewAlloca->setAlignment(MaybeAlign(Align)); // NewAddress = Address + Align Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy), diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 4dc9b611c156..9abb62ac788c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -24,6 +24,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -224,7 +225,7 @@ struct BoundsCheckingLegacyPass : public FunctionPass { } bool runOnFunction(Function &F) override { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); return addBoundsChecking(F, TLI, SE); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h index 971e00041762..8bb6f47c4846 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/CFGMST.h @@ -257,13 +257,13 @@ public: std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr)); if (Inserted) { // Newly inserted, update the real info. 
- Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Iter->second = std::move(std::make_unique<BBInfo>(Index)); Index++; } std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr)); if (Inserted) // Newly inserted, update the real info. - Iter->second = std::move(llvm::make_unique<BBInfo>(Index)); + Iter->second = std::move(std::make_unique<BBInfo>(Index)); AllEdges.emplace_back(new Edge(Src, Dest, W)); return *AllEdges.back(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 3f4f9bc7145d..d35abb92dd08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -27,7 +27,9 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -512,30 +514,38 @@ static bool isHoistable(Instruction *I, DominatorTree &DT) { // first-region entry block) or the (hoistable or unhoistable) base values that // are defined outside (including the first-region entry block) of the // scope. The returned set doesn't include constants. -static std::set<Value *> getBaseValues(Value *V, - DominatorTree &DT) { +static std::set<Value *> getBaseValues( + Value *V, DominatorTree &DT, + DenseMap<Value *, std::set<Value *>> &Visited) { + if (Visited.count(V)) { + return Visited[V]; + } std::set<Value *> Result; if (auto *I = dyn_cast<Instruction>(V)) { // We don't stop at a block that's not in the Scope because we would miss some // instructions that are based on the same base values if we stop there. if (!isHoistable(I, DT)) { Result.insert(I); + Visited.insert(std::make_pair(V, Result)); return Result; } // I is hoistable above the Scope. for (Value *Op : I->operands()) { - std::set<Value *> OpResult = getBaseValues(Op, DT); + std::set<Value *> OpResult = getBaseValues(Op, DT, Visited); Result.insert(OpResult.begin(), OpResult.end()); } + Visited.insert(std::make_pair(V, Result)); return Result; } if (isa<Argument>(V)) { Result.insert(V); + Visited.insert(std::make_pair(V, Result)); return Result; } // We don't include others like constants because those won't lead to any // chance of folding of conditions (eg two bit checks merged into one check) // after CHR. + Visited.insert(std::make_pair(V, Result)); return Result; // empty } @@ -615,6 +625,10 @@ static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb, assert(SumWt >= TrueWt && SumWt >= FalseWt && "Overflow calculating branch probabilities."); + // Guard against 0-to-0 branch weights to avoid a division-by-zero crash. 
+ if (SumWt == 0) + return false; + TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt); FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt); return true; @@ -1053,6 +1067,7 @@ static bool shouldSplit(Instruction *InsertPoint, DenseSet<Value *> &ConditionValues, DominatorTree &DT, DenseSet<Instruction *> &Unhoistables) { + assert(InsertPoint && "Null InsertPoint"); CHR_DEBUG( dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues "; for (Value *V : PrevConditionValues) { @@ -1063,7 +1078,6 @@ static bool shouldSplit(Instruction *InsertPoint, dbgs() << *V << ", "; } dbgs() << "\n"); - assert(InsertPoint && "Null InsertPoint"); // If any of Bases isn't hoistable to the hoist point, split. for (Value *V : ConditionValues) { DenseMap<Instruction *, bool> Visited; @@ -1078,12 +1092,13 @@ static bool shouldSplit(Instruction *InsertPoint, if (!PrevConditionValues.empty() && !ConditionValues.empty()) { // Use std::set as DenseSet doesn't work with set_intersection. std::set<Value *> PrevBases, Bases; + DenseMap<Value *, std::set<Value *>> Visited; for (Value *V : PrevConditionValues) { - std::set<Value *> BaseValues = getBaseValues(V, DT); + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); PrevBases.insert(BaseValues.begin(), BaseValues.end()); } for (Value *V : ConditionValues) { - std::set<Value *> BaseValues = getBaseValues(V, DT); + std::set<Value *> BaseValues = getBaseValues(V, DT, Visited); Bases.insert(BaseValues.begin(), BaseValues.end()); } CHR_DEBUG( @@ -1538,10 +1553,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp, } if (auto *SI = dyn_cast<SelectInst>(U)) { // Swap operands - Value *TrueValue = SI->getTrueValue(); - Value *FalseValue = SI->getFalseValue(); - SI->setTrueValue(FalseValue); - SI->setFalseValue(TrueValue); + SI->swapValues(); SI->swapProfMetadata(); if (Scope->TrueBiasedSelects.count(SI)) { assert(Scope->FalseBiasedSelects.count(SI) == 0 && @@ -2073,7 +2085,7 @@ bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) { getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo(); std::unique_ptr<OptimizationRemarkEmitter> OwnedORE = - llvm::make_unique<OptimizationRemarkEmitter>(&F); + std::make_unique<OptimizationRemarkEmitter>(&F); return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 2279c1bcb6a8..cf9a6a321c7a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -55,7 +55,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" @@ -83,13 +82,16 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SpecialCaseList.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> 
#include <cassert> #include <cstddef> @@ -479,7 +481,9 @@ DataFlowSanitizer::DataFlowSanitizer( std::vector<std::string> AllABIListFiles(std::move(ABIListFiles)); AllABIListFiles.insert(AllABIListFiles.end(), ClABIListFiles.begin(), ClABIListFiles.end()); - ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles)); + // FIXME: should we propagate vfs::FileSystem to this constructor? + ABIList.set( + SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem())); } FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { @@ -1212,7 +1216,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align, return DFS.ZeroShadow; case 1: { LoadInst *LI = new LoadInst(DFS.ShadowTy, ShadowAddr, "", Pos); - LI->setAlignment(ShadowAlign); + LI->setAlignment(MaybeAlign(ShadowAlign)); return LI; } case 2: { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 59950ffc4e9a..bf3e4ed3e31f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -86,7 +87,9 @@ public: ReversedVersion[3] = Options.Version[0]; ReversedVersion[4] = '\0'; } - bool runOnModule(Module &M, const TargetLibraryInfo &TLI); + bool + runOnModule(Module &M, + std::function<const TargetLibraryInfo &(Function &F)> GetTLI); private: // Create the .gcno files for the Module based on DebugInfo. @@ -102,9 +105,9 @@ private: std::vector<Regex> &Regexes); // Get pointers to the functions in the runtime library. 
- FunctionCallee getStartFileFunc(); - FunctionCallee getEmitFunctionFunc(); - FunctionCallee getEmitArcsFunc(); + FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI); + FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI); FunctionCallee getSummaryInfoFunc(); FunctionCallee getEndFileFunc(); @@ -126,9 +129,9 @@ private: // Checksum, produced by hash of EdgeDestinations SmallVector<uint32_t, 4> FileChecksums; - Module *M; - const TargetLibraryInfo *TLI; - LLVMContext *Ctx; + Module *M = nullptr; + std::function<const TargetLibraryInfo &(Function &F)> GetTLI; + LLVMContext *Ctx = nullptr; SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs; std::vector<Regex> FilterRe; std::vector<Regex> ExcludeRe; @@ -147,8 +150,9 @@ public: StringRef getPassName() const override { return "GCOV Profiler"; } bool runOnModule(Module &M) override { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); - return Profiler.runOnModule(M, TLI); + return Profiler.runOnModule(M, [this](Function &F) -> TargetLibraryInfo & { + return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -381,7 +385,7 @@ namespace { return EdgeDestinations; } - uint32_t getFuncChecksum() { + uint32_t getFuncChecksum() const { return FuncChecksum; } @@ -555,9 +559,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, return CurPath.str(); } -bool GCOVProfiler::runOnModule(Module &M, const TargetLibraryInfo &TLI) { +bool GCOVProfiler::runOnModule( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); Ctx = &M.getContext(); AddFlushBeforeForkAndExec(); @@ -574,9 +579,12 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M, ModuleAnalysisManager &AM) { GCOVProfiler Profiler(GCOVOpts); + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); - if (!Profiler.runOnModule(M, TLI)) + if (!Profiler.runOnModule(M, [&](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + })) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -624,6 +632,7 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) { void GCOVProfiler::AddFlushBeforeForkAndExec() { SmallVector<Instruction *, 2> ForkAndExecs; for (auto &F : M->functions()) { + auto *TLI = &GetTLI(F); for (auto &I : instructions(F)) { if (CallInst *CI = dyn_cast<CallInst>(&I)) { if (Function *Callee = CI->getCalledFunction()) { @@ -669,7 +678,8 @@ void GCOVProfiler::emitProfileNotes() { continue; std::error_code EC; - raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, sys::fs::F_None); + raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC, + sys::fs::OF_None); if (EC) { Ctx->emitError(Twine("failed to open coverage notes file for writing: ") + EC.message()); @@ -695,7 +705,7 @@ void GCOVProfiler::emitProfileNotes() { ++It; EntryBlock.splitBasicBlock(It); - Funcs.push_back(make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++, + Funcs.push_back(std::make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++, Options.UseCfgChecksum, Options.ExitBlockBeforeBody)); GCOVFunction &Func = *Funcs.back(); @@ -704,7 +714,10 @@ void GCOVProfiler::emitProfileNotes() { // to have a counter for the function definition. 
uint32_t Line = SP->getLine(); auto Filename = getFilename(SP); - Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); + + // Artificial functions such as global initializers + if (!SP->isArtificial()) + Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line); for (auto &BB : F) { GCOVBlock &Block = Func.getBlock(&BB); @@ -873,7 +886,7 @@ bool GCOVProfiler::emitProfileArcs() { return Result; } -FunctionCallee GCOVProfiler::getStartFileFunc() { +FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt8PtrTy(*Ctx), // const char *orig_filename Type::getInt8PtrTy(*Ctx), // const char version[4] @@ -887,7 +900,7 @@ FunctionCallee GCOVProfiler::getStartFileFunc() { return Res; } -FunctionCallee GCOVProfiler::getEmitFunctionFunc() { +FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name @@ -906,7 +919,7 @@ FunctionCallee GCOVProfiler::getEmitFunctionFunc() { return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } -FunctionCallee GCOVProfiler::getEmitArcsFunc() { +FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { Type::getInt32Ty(*Ctx), // uint32_t num_counters Type::getInt64PtrTy(*Ctx), // uint64_t *counters @@ -943,9 +956,11 @@ Function *GCOVProfiler::insertCounterWriteout( BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF); IRBuilder<> Builder(BB); - FunctionCallee StartFile = getStartFileFunc(); - FunctionCallee EmitFunction = getEmitFunctionFunc(); - FunctionCallee EmitArcs = getEmitArcsFunc(); + auto *TLI = &GetTLI(*WriteoutF); + + FunctionCallee StartFile = getStartFileFunc(TLI); + FunctionCallee EmitFunction = getEmitFunctionFunc(TLI); + FunctionCallee EmitArcs = getEmitArcsFunc(TLI); FunctionCallee SummaryInfo = getSummaryInfoFunc(); FunctionCallee EndFile = getEndFileFunc(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 90a9f4955a4b..7e8f8e27a97b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -12,10 +12,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -36,6 +38,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -52,7 +55,10 @@ using namespace llvm; #define DEBUG_TYPE "hwasan" static const char *const kHwasanModuleCtorName = "hwasan.module_ctor"; +static const char *const kHwasanNoteName = "hwasan.note"; static const char *const kHwasanInitName = "__hwasan_init"; +static const char *const kHwasanPersonalityThunkName = + "__hwasan_personality_thunk"; static const char *const kHwasanShadowMemoryDynamicAddress = "__hwasan_shadow_memory_dynamic_address"; @@ -112,6 +118,9 @@ static cl::opt<bool> 
ClGenerateTagsWithCalls( cl::desc("generate new tags with runtime library calls"), cl::Hidden, cl::init(false)); +static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"), + cl::Hidden, cl::init(false)); + static cl::opt<int> ClMatchAllTag( "hwasan-match-all-tag", cl::desc("don't report bad accesses via pointers with this tag"), @@ -155,8 +164,18 @@ static cl::opt<bool> static cl::opt<bool> ClInstrumentLandingPads("hwasan-instrument-landing-pads", - cl::desc("instrument landing pads"), cl::Hidden, - cl::init(true)); + cl::desc("instrument landing pads"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt<bool> ClUseShortGranules( + "hwasan-use-short-granules", + cl::desc("use short granules in allocas and outlined checks"), cl::Hidden, + cl::init(false), cl::ZeroOrMore); + +static cl::opt<bool> ClInstrumentPersonalityFunctions( + "hwasan-instrument-personality-functions", + cl::desc("instrument personality functions"), cl::Hidden, cl::init(false), + cl::ZeroOrMore); static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks", cl::desc("inline all checks"), @@ -169,16 +188,16 @@ namespace { class HWAddressSanitizer { public: explicit HWAddressSanitizer(Module &M, bool CompileKernel = false, - bool Recover = false) { + bool Recover = false) : M(M) { this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover; this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ? ClEnableKhwasan : CompileKernel; - initializeModule(M); + initializeModule(); } bool sanitizeFunction(Function &F); - void initializeModule(Module &M); + void initializeModule(); void initializeCallbacks(Module &M); @@ -203,7 +222,7 @@ public: Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong); bool instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag); Value *readRegister(IRBuilder<> &IRB, StringRef Name); bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec); @@ -216,9 +235,14 @@ public: Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty); void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord); + void instrumentGlobal(GlobalVariable *GV, uint8_t Tag); + void instrumentGlobals(); + + void instrumentPersonalityFunctions(); + private: LLVMContext *C; - std::string CurModuleUniqueId; + Module &M; Triple TargetTriple; FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset; FunctionCallee HWAsanHandleVfork; @@ -238,17 +262,21 @@ private: bool InTls; void init(Triple &TargetTriple); - unsigned getAllocaAlignment() const { return 1U << Scale; } + unsigned getObjectAlignment() const { return 1U << Scale; } }; ShadowMapping Mapping; + Type *VoidTy = Type::getVoidTy(M.getContext()); Type *IntptrTy; Type *Int8PtrTy; Type *Int8Ty; Type *Int32Ty; + Type *Int64Ty = Type::getInt64Ty(M.getContext()); bool CompileKernel; bool Recover; + bool UseShortGranules; + bool InstrumentLandingPads; Function *HwasanCtorFunction; @@ -257,7 +285,6 @@ private: FunctionCallee HwasanTagMemoryFunc; FunctionCallee HwasanGenerateTagFunc; - FunctionCallee HwasanThreadEnterFunc; Constant *ShadowGlobal; @@ -278,7 +305,7 @@ public: StringRef getPassName() const override { return "HWAddressSanitizer"; } bool doInitialization(Module &M) override { - HWASan = llvm::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover); + HWASan = std::make_unique<HWAddressSanitizer>(M, 
CompileKernel, Recover); return true; } @@ -333,7 +360,7 @@ PreservedAnalyses HWAddressSanitizerPass::run(Module &M, /// Module-level initialization. /// /// inserts a call to __hwasan_init to the module's constructor list. -void HWAddressSanitizer::initializeModule(Module &M) { +void HWAddressSanitizer::initializeModule() { LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n"); auto &DL = M.getDataLayout(); @@ -342,7 +369,6 @@ void HWAddressSanitizer::initializeModule(Module &M) { Mapping.init(TargetTriple); C = &(M.getContext()); - CurModuleUniqueId = getUniqueModuleId(&M); IRBuilder<> IRB(*C); IntptrTy = IRB.getIntPtrTy(DL); Int8PtrTy = IRB.getInt8PtrTy(); @@ -350,6 +376,21 @@ void HWAddressSanitizer::initializeModule(Module &M) { Int32Ty = IRB.getInt32Ty(); HwasanCtorFunction = nullptr; + + // Older versions of Android do not have the required runtime support for + // short granules, global or personality function instrumentation. On other + // platforms we currently require using the latest version of the runtime. + bool NewRuntime = + !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30); + + UseShortGranules = + ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime; + + // If we don't have personality function support, fall back to landing pads. + InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences() + ? ClInstrumentLandingPads + : !NewRuntime; + if (!CompileKernel) { std::tie(HwasanCtorFunction, std::ignore) = getOrCreateSanitizerCtorAndInitFunctions( @@ -363,6 +404,18 @@ void HWAddressSanitizer::initializeModule(Module &M) { Ctor->setComdat(CtorComdat); appendToGlobalCtors(M, Ctor, 0, Ctor); }); + + bool InstrumentGlobals = + ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime; + if (InstrumentGlobals) + instrumentGlobals(); + + bool InstrumentPersonalityFunctions = + ClInstrumentPersonalityFunctions.getNumOccurrences() + ? ClInstrumentPersonalityFunctions + : NewRuntime; + if (InstrumentPersonalityFunctions) + instrumentPersonalityFunctions(); } if (!TargetTriple.isAndroid()) { @@ -420,9 +473,6 @@ void HWAddressSanitizer::initializeCallbacks(Module &M) { HWAsanHandleVfork = M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy); - - HwasanThreadEnterFunc = - M.getOrInsertFunction("__hwasan_thread_enter", IRB.getVoidTy()); } Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) { @@ -456,7 +506,7 @@ Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I, unsigned *Alignment, Value **MaybeMask) { // Skip memory accesses inserted by another instrumentation. - if (I->getMetadata("nosanitize")) return nullptr; + if (I->hasMetadata("nosanitize")) return nullptr; // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) @@ -564,9 +614,11 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, TargetTriple.isOSBinFormatELF() && !Recover) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy); - IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::hwasan_check_memaccess), - {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + IRB.CreateCall(Intrinsic::getDeclaration( + M, UseShortGranules + ? 
Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess), + {shadowBase(), Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); return; } @@ -718,7 +770,9 @@ static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) { bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size) { - size_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); + size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + if (!UseShortGranules) + Size = AlignedSize; Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty()); if (ClInstrumentWithCalls) { @@ -735,10 +789,10 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, // llvm.memset right here into either a sequence of stores, or a call to // hwasan_tag_memory. if (ShadowSize) - IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, /*Align=*/1); + IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align::None()); if (Size != AlignedSize) { IRB.CreateStore( - ConstantInt::get(Int8Ty, Size % Mapping.getAllocaAlignment()), + ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()), IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize)); IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32( Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy), @@ -778,8 +832,9 @@ Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) { // FIXME: use addressofreturnaddress (but implement it in aarch64 backend // first). Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - auto GetStackPointerFn = - Intrinsic::getDeclaration(M, Intrinsic::frameaddress); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *StackPointer = IRB.CreateCall( GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}); @@ -876,34 +931,13 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy); assert(SlotPtr); - Instruction *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - Function *F = IRB.GetInsertBlock()->getParent(); - if (F->getFnAttribute("hwasan-abi").getValueAsString() == "interceptor") { - Value *ThreadLongEqZero = - IRB.CreateICmpEQ(ThreadLong, ConstantInt::get(IntptrTy, 0)); - auto *Br = cast<BranchInst>(SplitBlockAndInsertIfThen( - ThreadLongEqZero, cast<Instruction>(ThreadLongEqZero)->getNextNode(), - false, MDBuilder(*C).createBranchWeights(1, 100000))); - - IRB.SetInsertPoint(Br); - // FIXME: This should call a new runtime function with a custom calling - // convention to avoid needing to spill all arguments here. - IRB.CreateCall(HwasanThreadEnterFunc); - LoadInst *ReloadThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); - - IRB.SetInsertPoint(&*Br->getSuccessor(0)->begin()); - PHINode *ThreadLongPhi = IRB.CreatePHI(IntptrTy, 2); - ThreadLongPhi->addIncoming(ThreadLong, ThreadLong->getParent()); - ThreadLongPhi->addIncoming(ReloadThreadLong, ReloadThreadLong->getParent()); - ThreadLong = ThreadLongPhi; - } - + Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr); // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI. Value *ThreadLongMaybeUntagged = TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong); if (WithFrameRecord) { + Function *F = IRB.GetInsertBlock()->getParent(); StackBaseTag = IRB.CreateAShr(ThreadLong, 3); // Prepare ring buffer data. 
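The tagAlloca changes above implement HWASAN's short granules for stack objects: when an allocation is not a multiple of the granule size, the shadow byte covering the last, partially used granule receives Size % granule instead of the tag. A minimal standalone sketch of that size arithmetic, assuming the default 16-byte granule returned by Mapping.getObjectAlignment(); this is an illustration only, not part of the patch:

// Illustration only (not part of the patch): the size arithmetic behind the
// short-granule handling in tagAlloca above, assuming a 16-byte granule.
#include <cstdint>

struct GranuleInfo {
  uint64_t AlignedSize;   // size rounded up to the granule, as in alignTo()
  uint8_t LastGranuleUse; // Size % granule; 0 means the last granule is full
};

GranuleInfo shortGranuleInfo(uint64_t Size) {
  const uint64_t Granule = 16; // assumed Mapping.getObjectAlignment()
  GranuleInfo G;
  G.AlignedSize = (Size + Granule - 1) / Granule * Granule;
  G.LastGranuleUse = uint8_t(Size % Granule);
  return G;
}

// Example: a 20-byte alloca gives AlignedSize == 32 and LastGranuleUse == 4,
// so the shadow byte for the second granule is written as 4 rather than the
// pointer tag, matching the Size % Mapping.getObjectAlignment() store above.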
@@ -912,8 +946,10 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) { PC = readRegister(IRB, "pc"); else PC = IRB.CreatePtrToInt(F, IntptrTy); - auto GetStackPointerFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::frameaddress); + Module *M = F->getParent(); + auto GetStackPointerFn = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); Value *SP = IRB.CreatePtrToInt( IRB.CreateCall(GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())}), @@ -980,7 +1016,7 @@ bool HWAddressSanitizer::instrumentLandingPads( bool HWAddressSanitizer::instrumentStack( SmallVectorImpl<AllocaInst *> &Allocas, - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> &AllocaDeclareMap, + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap, SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) { // Ideally, we want to calculate tagged stack base pointer, and rewrite all // alloca addresses using that. Unfortunately, offsets are not known yet @@ -999,17 +1035,18 @@ bool HWAddressSanitizer::instrumentStack( AI->hasName() ? AI->getName().str() : "alloca." + itostr(N); Replacement->setName(Name + ".hwasan"); - for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) { - Use &U = *UI++; - if (U.getUser() != AILong) - U.set(Replacement); - } - - for (auto *DDI : AllocaDeclareMap.lookup(AI)) { - DIExpression *OldExpr = DDI->getExpression(); - DIExpression *NewExpr = DIExpression::append( - OldExpr, {dwarf::DW_OP_LLVM_tag_offset, RetagMask(N)}); - DDI->setArgOperand(2, MetadataAsValue::get(*C, NewExpr)); + AI->replaceUsesWithIf(Replacement, + [AILong](Use &U) { return U.getUser() != AILong; }); + + for (auto *DDI : AllocaDbgMap.lookup(AI)) { + // Prepend "tag_offset, N" to the dwarf expression. + // Tag offset logically applies to the alloca pointer, and it makes sense + // to put it at the beginning of the expression. + SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset, + RetagMask(N)}; + DDI->setArgOperand( + 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes( + DDI->getExpression(), NewOps))); } size_t Size = getAllocaSizeInBytes(*AI); @@ -1020,7 +1057,7 @@ bool HWAddressSanitizer::instrumentStack( // Re-tag alloca memory with the special UAR tag. 
Value *Tag = getUARTag(IRB, StackTag); - tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getAllocaAlignment())); + tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment())); } } @@ -1056,7 +1093,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { SmallVector<AllocaInst*, 8> AllocasToInstrument; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> LandingPadVec; - DenseMap<AllocaInst *, std::vector<DbgDeclareInst *>> AllocaDeclareMap; + DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap; for (auto &BB : F) { for (auto &Inst : BB) { if (ClInstrumentStack) @@ -1070,11 +1107,12 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { isa<CleanupReturnInst>(Inst)) RetVec.push_back(&Inst); - if (auto *DDI = dyn_cast<DbgDeclareInst>(&Inst)) - if (auto *Alloca = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) - AllocaDeclareMap[Alloca].push_back(DDI); + if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst)) + if (auto *Alloca = + dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation())) + AllocaDbgMap[Alloca].push_back(DDI); - if (ClInstrumentLandingPads && isa<LandingPadInst>(Inst)) + if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) LandingPadVec.push_back(&Inst); Value *MaybeMask = nullptr; @@ -1093,6 +1131,13 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!LandingPadVec.empty()) instrumentLandingPads(LandingPadVec); + if (AllocasToInstrument.empty() && F.hasPersonalityFn() && + F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) { + // __hwasan_personality_thunk is a no-op for functions without an + // instrumented stack, so we can drop it. + F.setPersonalityFn(nullptr); + } + if (AllocasToInstrument.empty() && ToInstrument.empty()) return false; @@ -1108,7 +1153,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { if (!AllocasToInstrument.empty()) { Value *StackTag = ClGenerateTagsWithCalls ? 
nullptr : getStackBaseTag(EntryIRB); - Changed |= instrumentStack(AllocasToInstrument, AllocaDeclareMap, RetVec, + Changed |= instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag); } @@ -1118,8 +1163,9 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap; for (AllocaInst *AI : AllocasToInstrument) { uint64_t Size = getAllocaSizeInBytes(*AI); - uint64_t AlignedSize = alignTo(Size, Mapping.getAllocaAlignment()); - AI->setAlignment(std::max(AI->getAlignment(), 16u)); + uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment()); + AI->setAlignment( + MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment()))); if (Size != AlignedSize) { Type *AllocatedType = AI->getAllocatedType(); if (AI->isArrayAllocation()) { @@ -1132,7 +1178,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { auto *NewAI = new AllocaInst( TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI); NewAI->takeName(AI); - NewAI->setAlignment(AI->getAlignment()); + NewAI->setAlignment(MaybeAlign(AI->getAlignment())); NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca()); NewAI->setSwiftError(AI->isSwiftError()); NewAI->copyMetadata(*AI); @@ -1179,6 +1225,257 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) { return Changed; } +void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) { + Constant *Initializer = GV->getInitializer(); + uint64_t SizeInBytes = + M.getDataLayout().getTypeAllocSize(Initializer->getType()); + uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment()); + if (SizeInBytes != NewSize) { + // Pad the initializer out to the next multiple of 16 bytes and add the + // required short granule tag. + std::vector<uint8_t> Init(NewSize - SizeInBytes, 0); + Init.back() = Tag; + Constant *Padding = ConstantDataArray::get(*C, Init); + Initializer = ConstantStruct::getAnon({Initializer, Padding}); + } + + auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(), + GlobalValue::ExternalLinkage, Initializer, + GV->getName() + ".hwasan"); + NewGV->copyAttributesFrom(GV); + NewGV->setLinkage(GlobalValue::PrivateLinkage); + NewGV->copyMetadata(GV, 0); + NewGV->setAlignment( + MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment()))); + + // It is invalid to ICF two globals that have different tags. In the case + // where the size of the global is a multiple of the tag granularity the + // contents of the globals may be the same but the tags (i.e. symbol values) + // may be different, and the symbols are not considered during ICF. In the + // case where the size is not a multiple of the granularity, the short granule + // tags would discriminate two globals with different tags, but there would + // otherwise be nothing stopping such a global from being incorrectly ICF'd + // with an uninstrumented (i.e. tag 0) global that happened to have the short + // granule tag in the last byte. 
+ NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None); + + // Descriptor format (assuming little-endian): + // bytes 0-3: relative address of global + // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case + // it isn't, we create multiple descriptors) + // byte 7: tag + auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty); + const uint64_t MaxDescriptorSize = 0xfffff0; + for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes; + DescriptorPos += MaxDescriptorSize) { + auto *Descriptor = + new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage, + nullptr, GV->getName() + ".hwasan.descriptor"); + auto *GVRelPtr = ConstantExpr::getTrunc( + ConstantExpr::getAdd( + ConstantExpr::getSub( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantExpr::getPtrToInt(Descriptor, Int64Ty)), + ConstantInt::get(Int64Ty, DescriptorPos)), + Int32Ty); + uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize); + auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24)); + Descriptor->setComdat(NewGV->getComdat()); + Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag})); + Descriptor->setSection("hwasan_globals"); + Descriptor->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(NewGV))); + appendToCompilerUsed(M, Descriptor); + } + + Constant *Aliasee = ConstantExpr::getIntToPtr( + ConstantExpr::getAdd( + ConstantExpr::getPtrToInt(NewGV, Int64Ty), + ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)), + GV->getType()); + auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(), + GV->getLinkage(), "", Aliasee, &M); + Alias->setVisibility(GV->getVisibility()); + Alias->takeName(GV); + GV->replaceAllUsesWith(Alias); + GV->eraseFromParent(); +} + +void HWAddressSanitizer::instrumentGlobals() { + // Start by creating a note that contains pointers to the list of global + // descriptors. Adding a note to the output file will cause the linker to + // create a PT_NOTE program header pointing to the note that we can use to + // find the descriptor list starting from the program headers. A function + // provided by the runtime initializes the shadow memory for the globals by + // accessing the descriptor list via the note. The dynamic loader needs to + // call this function whenever a library is loaded. + // + // The reason why we use a note for this instead of a more conventional + // approach of having a global constructor pass a descriptor list pointer to + // the runtime is because of an order of initialization problem. With + // constructors we can encounter the following problematic scenario: + // + // 1) library A depends on library B and also interposes one of B's symbols + // 2) B's constructors are called before A's (as required for correctness) + // 3) during construction, B accesses one of its "own" globals (actually + // interposed by A) and triggers a HWASAN failure due to the initialization + // for A not having happened yet + // + // Even without interposition it is possible to run into similar situations in + // cases where two libraries mutually depend on each other. + // + // We only need one note per binary, so put everything for the note in a + // comdat. 
+ Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName); + + Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0); + auto Start = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__start_hwasan_globals"); + Start->setVisibility(GlobalValue::HiddenVisibility); + Start->setDSOLocal(true); + auto Stop = + new GlobalVariable(M, Int8Arr0Ty, true, GlobalVariable::ExternalLinkage, + nullptr, "__stop_hwasan_globals"); + Stop->setVisibility(GlobalValue::HiddenVisibility); + Stop->setDSOLocal(true); + + // Null-terminated so actually 8 bytes, which are required in order to align + // the note properly. + auto *Name = ConstantDataArray::get(*C, "LLVM\0\0\0"); + + auto *NoteTy = StructType::get(Int32Ty, Int32Ty, Int32Ty, Name->getType(), + Int32Ty, Int32Ty); + auto *Note = + new GlobalVariable(M, NoteTy, /*isConstantGlobal=*/true, + GlobalValue::PrivateLinkage, nullptr, kHwasanNoteName); + Note->setSection(".note.hwasan.globals"); + Note->setComdat(NoteComdat); + Note->setAlignment(Align(4)); + Note->setDSOLocal(true); + + // The pointers in the note need to be relative so that the note ends up being + // placed in rodata, which is the standard location for notes. + auto CreateRelPtr = [&](Constant *Ptr) { + return ConstantExpr::getTrunc( + ConstantExpr::getSub(ConstantExpr::getPtrToInt(Ptr, Int64Ty), + ConstantExpr::getPtrToInt(Note, Int64Ty)), + Int32Ty); + }; + Note->setInitializer(ConstantStruct::getAnon( + {ConstantInt::get(Int32Ty, 8), // n_namesz + ConstantInt::get(Int32Ty, 8), // n_descsz + ConstantInt::get(Int32Ty, ELF::NT_LLVM_HWASAN_GLOBALS), // n_type + Name, CreateRelPtr(Start), CreateRelPtr(Stop)})); + appendToCompilerUsed(M, Note); + + // Create a zero-length global in hwasan_globals so that the linker will + // always create start and stop symbols. + auto Dummy = new GlobalVariable( + M, Int8Arr0Ty, /*isConstantGlobal*/ true, GlobalVariable::PrivateLinkage, + Constant::getNullValue(Int8Arr0Ty), "hwasan.dummy.global"); + Dummy->setSection("hwasan_globals"); + Dummy->setComdat(NoteComdat); + Dummy->setMetadata(LLVMContext::MD_associated, + MDNode::get(*C, ValueAsMetadata::get(Note))); + appendToCompilerUsed(M, Dummy); + + std::vector<GlobalVariable *> Globals; + for (GlobalVariable &GV : M.globals()) { + if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") || + GV.isThreadLocal()) + continue; + + // Common symbols can't have aliases point to them, so they can't be tagged. + if (GV.hasCommonLinkage()) + continue; + + // Globals with custom sections may be used in __start_/__stop_ enumeration, + // which would be broken both by adding tags and potentially by the extra + // padding/alignment that we insert. + if (GV.hasSection()) + continue; + + Globals.push_back(&GV); + } + + MD5 Hasher; + Hasher.update(M.getSourceFileName()); + MD5::MD5Result Hash; + Hasher.final(Hash); + uint8_t Tag = Hash[0]; + + for (GlobalVariable *GV : Globals) { + // Skip tag 0 in order to avoid collisions with untagged memory. + if (Tag == 0) + Tag = 1; + instrumentGlobal(GV, Tag++); + } +} + +void HWAddressSanitizer::instrumentPersonalityFunctions() { + // We need to untag stack frames as we unwind past them. That is the job of + // the personality function wrapper, which either wraps an existing + // personality function or acts as a personality function on its own. Each + // function that has a personality function or that can be unwound past has + // its personality function changed to a thunk that calls the personality + // function wrapper in the runtime. 
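The global instrumentation above pads each global to the granule size, retags its address through an alias, and records every tagged global in a descriptor placed in the hwasan_globals section, packing the size and tag into one 32-bit word (Size | (Tag << 24)) next to a 32-bit descriptor-relative pointer. The following minimal sketch shows how such a word could be unpacked by a consumer walking the __start_hwasan_globals/__stop_hwasan_globals range; the struct and function names are illustrative, not part of the patch or of the HWASAN runtime ABI:

// Illustration only: unpacking one descriptor from the hwasan_globals
// section created above. Field layout follows the comment in
// instrumentGlobal: bytes 0-3 hold a descriptor-relative address,
// bytes 4-6 the size (up to 16 MB per descriptor), byte 7 the tag.
#include <cstdint>

struct HwasanGlobalDescriptor { // illustrative name, not the runtime's
  int32_t GVRelPtr;             // region address relative to the descriptor
  uint32_t SizeAndTag;          // Size | (uint32_t(Tag) << 24)
};

struct TaggedRegion {
  uintptr_t Address; // start of the chunk this descriptor covers
  uint32_t Size;
  uint8_t Tag;
};

TaggedRegion decodeDescriptor(const HwasanGlobalDescriptor *D) {
  TaggedRegion R;
  // GVRelPtr was emitted as (global + DescriptorPos) - descriptor, so adding
  // the descriptor's own address recovers the covered chunk's address.
  R.Address = reinterpret_cast<uintptr_t>(D) + D->GVRelPtr;
  R.Size = D->SizeAndTag & 0x00ffffffu;
  R.Tag = uint8_t(D->SizeAndTag >> 24);
  return R;
}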
+ MapVector<Constant *, std::vector<Function *>> PersonalityFns; + for (Function &F : M) { + if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress)) + continue; + + if (F.hasPersonalityFn()) { + PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F); + } else if (!F.hasFnAttribute(Attribute::NoUnwind)) { + PersonalityFns[nullptr].push_back(&F); + } + } + + if (PersonalityFns.empty()) + return; + + FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction( + "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty, + Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy); + FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy); + FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy); + + for (auto &P : PersonalityFns) { + std::string ThunkName = kHwasanPersonalityThunkName; + if (P.first) + ThunkName += ("." + P.first->getName()).str(); + FunctionType *ThunkFnTy = FunctionType::get( + Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false); + bool IsLocal = P.first && (!isa<GlobalValue>(P.first) || + cast<GlobalValue>(P.first)->hasLocalLinkage()); + auto *ThunkFn = Function::Create(ThunkFnTy, + IsLocal ? GlobalValue::InternalLinkage + : GlobalValue::LinkOnceODRLinkage, + ThunkName, &M); + if (!IsLocal) { + ThunkFn->setVisibility(GlobalValue::HiddenVisibility); + ThunkFn->setComdat(M.getOrInsertComdat(ThunkName)); + } + + auto *BB = BasicBlock::Create(*C, "entry", ThunkFn); + IRBuilder<> IRB(BB); + CallInst *WrapperCall = IRB.CreateCall( + HwasanPersonalityWrapper, + {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2), + ThunkFn->getArg(3), ThunkFn->getArg(4), + P.first ? IRB.CreateBitCast(P.first, Int8PtrTy) + : Constant::getNullValue(Int8PtrTy), + IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy), + IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)}); + WrapperCall->setTailCall(); + IRB.CreateRet(WrapperCall); + + for (Function *F : P.second) + F->setPersonalityFn(ThunkFn); + } +} + void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) { Scale = kDefaultShadowScale; if (ClMappingOffset.getNumOccurrences() > 0) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index c7371f567ff3..d5787c8f62a1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -36,6 +36,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" @@ -403,7 +404,7 @@ static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI, AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); } else { - OwnedORE = llvm::make_unique<OptimizationRemarkEmitter>(&F); + OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F); ORE = OwnedORE.get(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp index a2c1ddfd279e..518b8895e836 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp @@ -9,6 +9,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" @@ -19,6 +20,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/ProfileData/InstrProf.h" @@ -28,7 +30,6 @@ #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/InstrOrderFile.h" #include <fstream> #include <map> #include <mutex> @@ -100,7 +101,8 @@ public: if (!ClOrderFileWriteMapping.empty()) { std::lock_guard<std::mutex> LogLock(MappingMutex); std::error_code EC; - llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, llvm::sys::fs::F_Append); + llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC, + llvm::sys::fs::OF_Append); if (EC) { report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping + " to save mapping file for order file instrumentation\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 5d3a793dda19..04c7e856b5d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" @@ -157,7 +158,10 @@ public: } bool runOnModule(Module &M) override { - return InstrProf.run(M, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()); + auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { + return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + }; + return InstrProf.run(M, GetTLI); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -370,8 +374,12 @@ private: } // end anonymous namespace PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) { - auto &TLI = AM.getResult<TargetLibraryAnalysis>(M); - if (!run(M, TLI)) + FunctionAnalysisManager &FAM = + AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { + return FAM.getResult<TargetLibraryAnalysis>(F); + }; + if (!run(M, GetTLI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -441,7 +449,7 @@ void InstrProfiling::promoteCounterLoadStores(Function *F) { std::unique_ptr<BlockFrequencyInfo> BFI; if (Options.UseBFIInPromotion) { std::unique_ptr<BranchProbabilityInfo> BPI; - BPI.reset(new BranchProbabilityInfo(*F, LI, TLI)); + BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F))); BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI)); } @@ -482,9 +490,10 @@ static bool containsProfilingIntrinsics(Module &M) { return false; } -bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) { +bool InstrProfiling::run( + Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) { this->M = &M; - this->TLI = &TLI; + this->GetTLI = std::move(GetTLI); NamesVar = nullptr; NamesSize = 0; ProfileDataMap.clear(); @@ -601,6 +610,7 @@ void 
InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) { bool IsRange = (Ind->getValueKind()->getZExtValue() == llvm::InstrProfValueKind::IPVK_MemOPSize); CallInst *Call = nullptr; + auto *TLI = &GetTLI(*Ind->getFunction()); if (!IsRange) { Value *Args[3] = {Ind->getTargetValue(), Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()), @@ -749,7 +759,6 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // of the parent function, that will result in relocations against discarded // sections. bool NeedComdat = needsComdatForCounter(*Fn, *M); - Comdat *Cmdt = nullptr; // Comdat group. if (NeedComdat) { if (TT.isOSBinFormatCOFF()) { // For COFF, put the counters, data, and values each into their own @@ -758,14 +767,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE. Linkage = GlobalValue::LinkOnceODRLinkage; Visibility = GlobalValue::HiddenVisibility; - } else { - // Otherwise, create one comdat group for everything. - Cmdt = M->getOrInsertComdat(getVarName(Inc, getInstrProfComdatPrefix())); } } auto MaybeSetComdat = [=](GlobalVariable *GV) { if (NeedComdat) - GV->setComdat(Cmdt ? Cmdt : M->getOrInsertComdat(GV->getName())); + GV->setComdat(M->getOrInsertComdat(GV->getName())); }; uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); @@ -780,7 +786,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { CounterPtr->setVisibility(Visibility); CounterPtr->setSection( getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat())); - CounterPtr->setAlignment(8); + CounterPtr->setAlignment(Align(8)); MaybeSetComdat(CounterPtr); CounterPtr->setLinkage(Linkage); @@ -802,7 +808,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { ValuesVar->setVisibility(Visibility); ValuesVar->setSection( getInstrProfSectionName(IPSK_vals, TT.getObjectFormat())); - ValuesVar->setAlignment(8); + ValuesVar->setAlignment(Align(8)); MaybeSetComdat(ValuesVar); ValuesPtrExpr = ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx)); @@ -835,7 +841,7 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) { getVarName(Inc, getInstrProfDataVarPrefix())); Data->setVisibility(Visibility); Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat())); - Data->setAlignment(INSTR_PROF_DATA_ALIGNMENT); + Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); MaybeSetComdat(Data); Data->setLinkage(Linkage); @@ -926,7 +932,7 @@ void InstrProfiling::emitNameData() { // On COFF, it's important to reduce the alignment down to 1 to prevent the // linker from inserting padding before the start of the names section or // between names entries. 
- NamesVar->setAlignment(1); + NamesVar->setAlignment(Align::None()); UsedVars.push_back(NamesVar); for (auto *NamePtr : ReferencedNames) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index f56a1bd91b89..a6c2c9b464b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -68,7 +68,8 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str, GlobalValue::PrivateLinkage, StrConst, NamePrefix); if (AllowMerging) GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + GV->setAlignment(Align::None()); // Strings may not be merged w/o setting + // alignment explicitly. return GV; } @@ -116,7 +117,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeMemorySanitizerLegacyPassPass(Registry); initializeHWAddressSanitizerLegacyPassPass(Registry); initializeThreadSanitizerLegacyPassPass(Registry); - initializeSanitizerCoverageModulePass(Registry); + initializeModuleSanitizerCoverageLegacyPassPass(Registry); initializeDataFlowSanitizerPass(Registry); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b25cbed1bb02..f581142df8f7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -170,12 +170,14 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" @@ -202,8 +204,8 @@ using namespace llvm; #define DEBUG_TYPE "msan" static const unsigned kOriginSize = 4; -static const unsigned kMinOriginAlignment = 4; -static const unsigned kShadowTLSAlignment = 8; +static const Align kMinOriginAlignment = Align(4); +static const Align kShadowTLSAlignment = Align(8); // These constants must be kept in sync with the ones in msan.h. static const unsigned kParamTLSSize = 800; @@ -462,16 +464,9 @@ namespace { /// the module. class MemorySanitizer { public: - MemorySanitizer(Module &M, MemorySanitizerOptions Options) { - this->CompileKernel = - ClEnableKmsan.getNumOccurrences() > 0 ? ClEnableKmsan : Options.Kernel; - if (ClTrackOrigins.getNumOccurrences() > 0) - this->TrackOrigins = ClTrackOrigins; - else - this->TrackOrigins = this->CompileKernel ? 2 : Options.TrackOrigins; - this->Recover = ClKeepGoing.getNumOccurrences() > 0 - ? ClKeepGoing - : (this->CompileKernel | Options.Recover); + MemorySanitizer(Module &M, MemorySanitizerOptions Options) + : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins), + Recover(Options.Recover) { initializeModule(M); } @@ -594,10 +589,26 @@ private: /// An empty volatile inline asm that prevents callback merge. 
InlineAsm *EmptyAsm; - - Function *MsanCtorFunction; }; +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kMsanModuleCtorName, kMsanInitName, + /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { + if (!ClWithComdat) { + appendToGlobalCtors(M, Ctor, 0); + return; + } + Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); + Ctor->setComdat(MsanCtorComdat); + appendToGlobalCtors(M, Ctor, 0, Ctor); + }); +} + /// A legacy function pass for msan instrumentation. /// /// Instruments functions to detect unitialized reads. @@ -615,7 +626,7 @@ struct MemorySanitizerLegacyPass : public FunctionPass { bool runOnFunction(Function &F) override { return MSan->sanitizeFunction( - F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI()); + F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F)); } bool doInitialization(Module &M) override; @@ -623,8 +634,17 @@ struct MemorySanitizerLegacyPass : public FunctionPass { MemorySanitizerOptions Options; }; +template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) { + return (Opt.getNumOccurrences() > 0) ? Opt : Default; +} + } // end anonymous namespace +MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K) + : Kernel(getOptOrDefault(ClEnableKmsan, K)), + TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)), + Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {} + PreservedAnalyses MemorySanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { MemorySanitizer Msan(*F.getParent(), Options); @@ -633,6 +653,14 @@ PreservedAnalyses MemorySanitizerPass::run(Function &F, return PreservedAnalyses::all(); } +PreservedAnalyses MemorySanitizerPass::run(Module &M, + ModuleAnalysisManager &AM) { + if (Options.Kernel) + return PreservedAnalyses::all(); + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char MemorySanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan", @@ -918,23 +946,6 @@ void MemorySanitizer::initializeModule(Module &M) { OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000); if (!CompileKernel) { - std::tie(MsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kMsanModuleCtorName, kMsanInitName, - /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - if (!ClWithComdat) { - appendToGlobalCtors(M, Ctor, 0); - return; - } - Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName); - Ctor->setComdat(MsanCtorComdat); - appendToGlobalCtors(M, Ctor, 0, Ctor); - }); - if (TrackOrigins) M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] { return new GlobalVariable( @@ -952,6 +963,8 @@ void MemorySanitizer::initializeModule(Module &M) { } bool MemorySanitizerLegacyPass::doInitialization(Module &M) { + if (!Options.Kernel) + insertModuleCtor(M); MSan.emplace(M, Options); return true; } @@ -1075,15 +1088,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// Fill memory range with the given origin value. 
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr, - unsigned Size, unsigned Alignment) { + unsigned Size, Align Alignment) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned IntptrAlignment = DL.getABITypeAlignment(MS.IntptrTy); + const Align IntptrAlignment = Align(DL.getABITypeAlignment(MS.IntptrTy)); unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy); assert(IntptrAlignment >= kMinOriginAlignment); assert(IntptrSize >= kOriginSize); unsigned Ofs = 0; - unsigned CurrentAlignment = Alignment; + Align CurrentAlignment = Alignment; if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) { Value *IntptrOrigin = originToIntptr(IRB, Origin); Value *IntptrOriginPtr = @@ -1091,7 +1104,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = 0; i < Size / IntptrSize; ++i) { Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i) : IntptrOriginPtr; - IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment); + IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment.value()); Ofs += IntptrSize / kOriginSize; CurrentAlignment = IntptrAlignment; } @@ -1100,23 +1113,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) { Value *GEP = i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr; - IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment); + IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment.value()); CurrentAlignment = kMinOriginAlignment; } } void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin, - Value *OriginPtr, unsigned Alignment, bool AsCall) { + Value *OriginPtr, Align Alignment, bool AsCall) { const DataLayout &DL = F.getParent()->getDataLayout(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); if (Shadow->getType()->isAggregateType()) { paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); @@ -1153,12 +1165,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = SI->isAtomic() ? 
getCleanShadow(Val) : getShadow(Val); Value *ShadowPtr, *OriginPtr; Type *ShadowTy = Shadow->getType(); - unsigned Alignment = SI->getAlignment(); - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); + const Align Alignment = assumeAligned(SI->getAlignment()); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true); - StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment); + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment.value()); LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; @@ -1196,8 +1209,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n"); - Constant *ConstantShadow = dyn_cast_or_null<Constant>(ConvertedShadow); - if (ConstantShadow) { + if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) { if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) { insertWarningFn(IRB, Origin); } @@ -1392,10 +1404,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Shadow = ShadowBase + Offset /// Origin = (OriginBase + Offset) & ~3ULL - std::pair<Value *, Value *> getShadowOriginPtrUserspace(Value *Addr, - IRBuilder<> &IRB, - Type *ShadowTy, - unsigned Alignment) { + std::pair<Value *, Value *> + getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, + MaybeAlign Alignment) { Value *ShadowOffset = getShadowPtrOffset(Addr, IRB); Value *ShadowLong = ShadowOffset; uint64_t ShadowBase = MS.MapParams->ShadowBase; @@ -1413,8 +1424,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (OriginBase != 0) OriginLong = IRB.CreateAdd(OriginLong, ConstantInt::get(MS.IntptrTy, OriginBase)); - if (Alignment < kMinOriginAlignment) { - uint64_t Mask = kMinOriginAlignment - 1; + if (!Alignment || *Alignment < kMinOriginAlignment) { + uint64_t Mask = kMinOriginAlignment.value() - 1; OriginLong = IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask)); } @@ -1424,9 +1435,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return std::make_pair(ShadowPtr, OriginPtr); } - std::pair<Value *, Value *> - getShadowOriginPtrKernel(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, bool isStore) { + std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr, + IRBuilder<> &IRB, + Type *ShadowTy, + bool isStore) { Value *ShadowOriginPtrs; const DataLayout &DL = F.getParent()->getDataLayout(); int Size = DL.getTypeStoreSize(ShadowTy); @@ -1451,14 +1463,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy, - unsigned Alignment, + MaybeAlign Alignment, bool isStore) { - std::pair<Value *, Value *> ret; if (MS.CompileKernel) - ret = getShadowOriginPtrKernel(Addr, IRB, ShadowTy, Alignment, isStore); - else - ret = getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); - return ret; + return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore); + return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment); } /// Compute the shadow address for a given function argument. 
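The getShadowOriginPtrUserspace changes above keep the mapping documented in its comment: the shadow address is ShadowBase plus a target-specific offset derived from the application address, and the origin address is the same offset rebased onto OriginBase and aligned down to the 4-byte origin cell. A minimal sketch of that arithmetic follows; the offset computation itself (getShadowPtrOffset) is target specific and left abstract here, so this is an illustration rather than the actual mapping code:

// Illustration only: the userspace shadow/origin mapping described in the
// comment above getShadowOriginPtrUserspace:
//   Shadow = ShadowBase + Offset
//   Origin = (OriginBase + Offset) & ~3ULL
#include <cstdint>

struct ShadowOrigin {
  uint64_t Shadow;
  uint64_t Origin;
};

// "Offset" stands in for the target-specific getShadowPtrOffset() result,
// which is not shown in this hunk.
ShadowOrigin mapUserspace(uint64_t Offset, uint64_t ShadowBase,
                          uint64_t OriginBase) {
  ShadowOrigin R;
  R.Shadow = ShadowBase + Offset;
  // Origin cells are kOriginSize (4) bytes wide, so align down to 4; the
  // patch skips this mask when the access is already known to be at least
  // 4-byte aligned.
  R.Origin = (OriginBase + Offset) & ~3ULL;
  return R;
}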
@@ -1608,11 +1617,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ByVal pointer itself has clean shadow. We copy the actual // argument shadow to the underlying memory. // Figure out maximal valid memcpy alignment. - unsigned ArgAlign = FArg.getParamAlignment(); - if (ArgAlign == 0) { - Type *EltType = A->getType()->getPointerElementType(); - ArgAlign = DL.getABITypeAlignment(EltType); - } + const Align ArgAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(FArg.getParamAlignment()), + A->getType()->getPointerElementType()); Value *CpShadowPtr = getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign, /*isStore*/ true) @@ -1624,7 +1631,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()), Size, ArgAlign); } else { - unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); + const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment); Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base, CopyAlign, Size); LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n"); @@ -1636,8 +1643,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // ParamTLS overflow. *ShadowPtr = getCleanShadow(V); } else { - *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base, - kShadowTLSAlignment); + *ShadowPtr = EntryIRB.CreateAlignedLoad( + getShadowTy(&FArg), Base, kShadowTLSAlignment.value()); } } LLVM_DEBUG(dbgs() @@ -1771,13 +1778,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(I.getNextNode()); Type *ShadowTy = getShadowTy(&I); Value *Addr = I.getPointerOperand(); - Value *ShadowPtr, *OriginPtr; - unsigned Alignment = I.getAlignment(); + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; + const Align Alignment = assumeAligned(I.getAlignment()); if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -1790,9 +1797,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (MS.TrackOrigins) { if (PropagateShadow) { - unsigned OriginAlignment = std::max(kMinOriginAlignment, Alignment); - setOrigin( - &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment)); + const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment); + setOrigin(&I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, + OriginAlignment.value())); } else { setOrigin(&I, getCleanOrigin()); } @@ -1814,8 +1821,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getOperand(0); - Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), - /*Alignment*/ 1, /*isStore*/ true) + Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align::None(), + /*isStore*/ true) .first; if (ClCheckAccessAddress) @@ -2447,7 +2454,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // We don't know the pointer alignment (could be unaligned SSE store!). // Have to assume to worst case. 
std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), /*Alignment*/ 1, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Align::None(), /*isStore*/ true); IRB.CreateAlignedStore(Shadow, ShadowPtr, 1); if (ClCheckAccessAddress) @@ -2467,15 +2474,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Addr = I.getArgOperand(0); Type *ShadowTy = getShadowTy(&I); - Value *ShadowPtr, *OriginPtr; + Value *ShadowPtr = nullptr, *OriginPtr = nullptr; if (PropagateShadow) { // We don't know the pointer alignment (could be unaligned SSE load!). // Have to assume to worst case. - unsigned Alignment = 1; + const Align Alignment = Align::None(); std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); - setShadow(&I, - IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld")); + setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, + Alignment.value(), "_msld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -2562,6 +2569,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return false; } + void handleInvariantGroup(IntrinsicInst &I) { + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; @@ -2857,7 +2869,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value* Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); Value *ShadowPtr = - getShadowOriginPtr(Addr, IRB, Ty, /*Alignment*/ 1, /*isStore*/ true) + getShadowOriginPtr(Addr, IRB, Ty, Align::None(), /*isStore*/ true) .first; IRB.CreateStore(getCleanShadow(Ty), @@ -2873,7 +2885,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); Type *Ty = IRB.getInt32Ty(); - unsigned Alignment = 1; + const Align Alignment = Align::None(); Value *ShadowPtr, *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false); @@ -2881,7 +2893,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (ClCheckAccessAddress) insertShadowCheck(Addr, &I); - Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr"); + Value *Shadow = + IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment.value(), "_ldmxcsr"); Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr) : getCleanOrigin(); insertShadowCheck(Shadow, Origin, &I); @@ -2891,14 +2904,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); Value *Addr = I.getArgOperand(1); - unsigned Align = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()); Value *Mask = I.getArgOperand(3); Value *Shadow = getShadow(V); Value *ShadowPtr; Value *OriginPtr; std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr( - Addr, IRB, Shadow->getType(), Align, /*isStore*/ true); + Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true); if (ClCheckAccessAddress) { insertShadowCheck(Addr, &I); @@ -2907,20 +2921,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertShadowCheck(Mask, &I); } - IRB.CreateMaskedStore(Shadow, ShadowPtr, Align, Mask); + IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment ? 
Alignment->value() : 0, + Mask); if (MS.TrackOrigins) { auto &DL = F.getParent()->getDataLayout(); paintOrigin(IRB, getOrigin(V), OriginPtr, DL.getTypeStoreSize(Shadow->getType()), - std::max(Align, kMinOriginAlignment)); + llvm::max(Alignment, kMinOriginAlignment)); } } bool handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Addr = I.getArgOperand(0); - unsigned Align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + const MaybeAlign Alignment( + cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); Value *Mask = I.getArgOperand(2); Value *PassThru = I.getArgOperand(3); @@ -2928,9 +2944,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *ShadowPtr, *OriginPtr; if (PropagateShadow) { std::tie(ShadowPtr, OriginPtr) = - getShadowOriginPtr(Addr, IRB, ShadowTy, Align, /*isStore*/ false); - setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Align, Mask, - getShadow(PassThru), "_msmaskedld")); + getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false); + setShadow(&I, IRB.CreateMaskedLoad( + ShadowPtr, Alignment ? Alignment->value() : 0, Mask, + getShadow(PassThru), "_msmaskedld")); } else { setShadow(&I, getCleanShadow(&I)); } @@ -2988,11 +3005,52 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } + Constant *getPclmulMask(IRBuilder<> &IRB, unsigned Width, bool OddElements) { + SmallVector<Constant *, 8> Mask; + for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) { + Constant *C = ConstantInt::get(IRB.getInt32Ty(), X); + Mask.push_back(C); + Mask.push_back(C); + } + return ConstantVector::get(Mask); + } + + // Instrument pclmul intrinsics. + // These intrinsics operate either on odd or on even elements of the input + // vectors, depending on the constant in the 3rd argument, ignoring the rest. + // Replace the unused elements with copies of the used ones, ex: + // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case) + // or + // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case) + // and then apply the usual shadow combining logic. + void handlePclmulIntrinsic(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + Type *ShadowTy = getShadowTy(&I); + unsigned Width = I.getArgOperand(0)->getType()->getVectorNumElements(); + assert(isa<ConstantInt>(I.getArgOperand(2)) && + "pclmul 3rd operand must be a constant"); + unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); + Value *Shuf0 = + IRB.CreateShuffleVector(getShadow(&I, 0), UndefValue::get(ShadowTy), + getPclmulMask(IRB, Width, Imm & 0x01)); + Value *Shuf1 = + IRB.CreateShuffleVector(getShadow(&I, 1), UndefValue::get(ShadowTy), + getPclmulMask(IRB, Width, Imm & 0x10)); + ShadowAndOriginCombiner SOC(this, IRB); + SOC.Add(Shuf0, getOrigin(&I, 0)); + SOC.Add(Shuf1, getOrigin(&I, 1)); + SOC.Done(&I); + } + void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case Intrinsic::lifetime_start: handleLifetimeStart(I); break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; case Intrinsic::bswap: handleBswap(I); break; @@ -3217,6 +3275,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { handleBmiIntrinsic(I); break; + case Intrinsic::x86_pclmulqdq: + case Intrinsic::x86_pclmulqdq_256: + case Intrinsic::x86_pclmulqdq_512: + handlePclmulIntrinsic(I); + break; + case Intrinsic::is_constant: // The result of llvm.is.constant() is always defined. 
setShadow(&I, getCleanShadow(&I)); @@ -3258,7 +3322,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // Clear out readonly/readnone attributes. AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); Func->removeAttributes(AttributeList::FunctionIndex, B); } @@ -3292,8 +3359,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { "ByVal argument is not a pointer!"); Size = DL.getTypeAllocSize(A->getType()->getPointerElementType()); if (ArgOffset + Size > kParamTLSSize) break; - unsigned ParamAlignment = CS.getParamAlignment(i); - unsigned Alignment = std::min(ParamAlignment, kShadowTLSAlignment); + const MaybeAlign ParamAlignment(CS.getParamAlignment(i)); + MaybeAlign Alignment = llvm::None; + if (ParamAlignment) + Alignment = std::min(*ParamAlignment, kShadowTLSAlignment); Value *AShadowPtr = getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ false) @@ -3306,7 +3375,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Size = DL.getTypeAllocSize(A->getType()); if (ArgOffset + Size > kParamTLSSize) break; Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, - kShadowTLSAlignment); + kShadowTLSAlignment.value()); Constant *Cst = dyn_cast<Constant>(ArgShadow); if (Cst && Cst->isNullValue()) ArgIsInitialized = true; } @@ -3332,7 +3401,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0. Value *Base = getShadowPtrForRetval(&I, IRBBefore); - IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, + kShadowTLSAlignment.value()); BasicBlock::iterator NextInsn; if (CS.isCall()) { NextInsn = ++I.getIterator(); @@ -3356,7 +3426,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBAfter(&*NextInsn); Value *RetvalShadow = IRBAfter.CreateAlignedLoad( getShadowTy(&I), getShadowPtrForRetval(&I, IRBAfter), - kShadowTLSAlignment, "_msret"); + kShadowTLSAlignment.value(), "_msret"); setShadow(&I, RetvalShadow); if (MS.TrackOrigins) setOrigin(&I, IRBAfter.CreateLoad(MS.OriginTy, @@ -3383,10 +3453,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (CheckReturnValue) { insertShadowCheck(RetVal, &I); Value *Shadow = getCleanShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); } else { Value *Shadow = getShadow(RetVal); - IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value()); if (MS.TrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -3427,11 +3497,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len}); } else { Value *ShadowBase, *OriginBase; - std::tie(ShadowBase, OriginBase) = - getShadowOriginPtr(&I, IRB, IRB.getInt8Ty(), 1, /*isStore*/ true); + std::tie(ShadowBase, OriginBase) = getShadowOriginPtr( + &I, IRB, IRB.getInt8Ty(), Align::None(), /*isStore*/ true); Value *PoisonValue = IRB.getInt8(PoisonStack ? 
ClPoisonStackPattern : 0); - IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment()); + IRB.CreateMemSet(ShadowBase, PoisonValue, Len, + MaybeAlign(I.getAlignment())); } if (PoisonStack && MS.TrackOrigins) { @@ -3627,10 +3698,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { int getNumOutputArgs(InlineAsm *IA, CallBase *CB) { int NumRetOutputs = 0; int NumOutputs = 0; - Type *RetTy = dyn_cast<Value>(CB)->getType(); + Type *RetTy = cast<Value>(CB)->getType(); if (!RetTy->isVoidTy()) { // Register outputs are returned via the CallInst return value. - StructType *ST = dyn_cast_or_null<StructType>(RetTy); + auto *ST = dyn_cast<StructType>(RetTy); if (ST) NumRetOutputs = ST->getNumElements(); else @@ -3667,7 +3738,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // corresponding CallInst has nO+nI+1 operands (the last operand is the // function to be called). const DataLayout &DL = F.getParent()->getDataLayout(); - CallBase *CB = dyn_cast<CallBase>(&I); + CallBase *CB = cast<CallBase>(&I); IRBuilder<> IRB(&I); InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue()); int OutputArgs = getNumOutputArgs(IA, CB); @@ -3843,7 +3914,7 @@ struct VarArgAMD64Helper : public VarArgHelper { if (!ShadowBase) continue; Value *Shadow = MSV.getShadow(A); - IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment); + IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment.value()); if (MS.TrackOrigins) { Value *Origin = MSV.getOrigin(A); unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType()); @@ -3884,7 +3955,7 @@ struct VarArgAMD64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -3922,10 +3993,11 @@ struct VarArgAMD64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); if (MS.TrackOrigins) { VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSOriginCopy, 8, MS.VAArgOriginTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS, + Align(8), CopySize); } } @@ -3944,7 +4016,7 @@ struct VarArgAMD64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 16; + const Align Alignment = Align(16); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4012,7 +4084,8 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAArgOffset = alignTo(VAArgOffset, 8); if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset); @@ -4038,7 +4111,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align 
Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4050,7 +4123,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4069,7 +4142,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. @@ -4085,7 +4158,7 @@ struct VarArgMIPS64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4183,7 +4256,8 @@ struct VarArgAArch64Helper : public VarArgHelper { continue; if (!Base) continue; - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } Constant *OverflowSize = ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset); @@ -4207,7 +4281,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4219,7 +4293,7 @@ struct VarArgAArch64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4260,7 +4334,7 @@ struct VarArgAArch64Helper : public VarArgHelper { IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset), VAArgOverflowSize); VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize); @@ -4311,14 +4385,15 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *GrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, GrRegSaveAreaShadowPtrOff); Value *GrCopySize = IRB.CreateSub(GrArgSize, 
GrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, 8, GrSrcPtr, 8, GrCopySize); + IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8), + GrCopySize); // Again, but for FP/SIMD values. Value *VrRegSaveAreaShadowPtrOff = @@ -4326,7 +4401,7 @@ struct VarArgAArch64Helper : public VarArgHelper { Value *VrRegSaveAreaShadowPtr = MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 8, /*isStore*/ true) + Align(8), /*isStore*/ true) .first; Value *VrSrcPtr = IRB.CreateInBoundsGEP( @@ -4336,20 +4411,21 @@ struct VarArgAArch64Helper : public VarArgHelper { VrRegSaveAreaShadowPtrOff); Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff); - IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, 8, VrSrcPtr, 8, VrCopySize); + IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8), + VrCopySize); // And finally for remaining arguments. Value *StackSaveAreaShadowPtr = MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(), - /*Alignment*/ 16, /*isStore*/ true) + Align(16), /*isStore*/ true) .first; Value *StackSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy, IRB.getInt32(AArch64VAEndOffset)); - IRB.CreateMemCpy(StackSaveAreaShadowPtr, 16, StackSrcPtr, 16, - VAArgOverflowSize); + IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr, + Align(16), VAArgOverflowSize); } } }; @@ -4441,7 +4517,8 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset - VAArgBase, ArgSize); if (Base) - IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, + kShadowTLSAlignment.value()); } VAArgOffset += ArgSize; VAArgOffset = alignTo(VAArgOffset, 8); @@ -4474,7 +4551,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { VAStartInstrumentationList.push_back(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()), @@ -4485,7 +4562,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { IRBuilder<> IRB(&I); Value *VAListTag = I.getArgOperand(0); Value *ShadowPtr, *OriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr( VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); // Unpoison the whole __va_list_tag. @@ -4506,7 +4583,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { // If there is a va_start in this function, make a backup copy of // va_arg_tls somewhere in the function entry block. VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize); - IRB.CreateMemCpy(VAArgTLSCopy, 8, MS.VAArgTLS, 8, CopySize); + IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize); } // Instrument va_start. 
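Nearly every hunk in these VarArg helpers is the same unsigned-to-Align migration seen earlier in the file. A short sketch, assuming the llvm/Support/Alignment.h API this patch targets, of the three idioms it keeps reusing: assumeAligned() for the legacy "0 means unknown" encoding, Align::None() for an explicit 1-byte alignment, and "A ? A->value() : 0" where an IRBuilder overload still takes a raw integer:

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cstdint>
using namespace llvm;

// Sketch of the alignment idioms used throughout this patch (illustrative).
uint64_t alignmentIdioms(unsigned LegacyAlign) {
  const Align Known = assumeAligned(LegacyAlign); // 0 is promoted to Align(1)
  const Align One = Align::None();                // named constant for Align(1)
  const MaybeAlign Maybe(LegacyAlign);            // keeps "unknown" representable
  uint64_t Raw = Maybe ? Maybe->value() : 0;      // hand a raw value to older APIs
  return std::max(Known, One).value() + Raw;
}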
@@ -4522,7 +4599,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper { Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr); Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr; - unsigned Alignment = 8; + const Align Alignment = Align(8); std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) = MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true); @@ -4567,14 +4644,18 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan, } bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) { - if (!CompileKernel && (&F == MsanCtorFunction)) + if (!CompileKernel && F.getName() == kMsanModuleCtorName) return false; + MemorySanitizerVisitor Visitor(F, *this, TLI); // Clear out readonly/readnone attributes. AttrBuilder B; B.addAttribute(Attribute::ReadOnly) - .addAttribute(Attribute::ReadNone); + .addAttribute(Attribute::ReadNone) + .addAttribute(Attribute::WriteOnly) + .addAttribute(Attribute::ArgMemOnly) + .addAttribute(Attribute::Speculatable); F.removeAttributes(AttributeList::FunctionIndex, B); return Visitor.runOnFunction(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 6fec3c9c79ee..cc96bdd1d516 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -47,7 +47,9 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +63,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -92,10 +93,12 @@ #include "llvm/IR/ProfileSummary.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/Support/BranchProbability.h" +#include "llvm/Support/CRC.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DOTGraphTraits.h" @@ -103,11 +106,10 @@ #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GraphWriter.h" -#include "llvm/Support/JamCRC.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/MisExpect.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -120,6 +122,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -286,6 +289,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select 
instruction visitor plays three roles specified @@ -348,50 +356,6 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. - unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector<Instruction *> Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector<Instruction *> findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. - void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -563,13 +527,14 @@ private: // A map that stores the Comdat group in function F. std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector<std::vector<Instruction *>> ValueSites; + std::vector<std::vector<VPCandidateInfo>> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -604,23 +569,21 @@ public: std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -647,7 +610,7 @@ public: // value of each BB in the CFG. 
The higher 32 bits record the number of edges. template <class Edge, class BBInfo> void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { - std::vector<char> Indexes; + std::vector<uint8_t> Indexes; JamCRC JC; for (auto &BB : F) { const Instruction *TI = BB.getTerminator(); @@ -658,7 +621,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() { continue; uint32_t Index = BI->Index; for (int J = 0; J < 4; J++) - Indexes.push_back((char)(Index >> (J * 8))); + Indexes.push_back((uint8_t)(Index >> (J * 8))); } } JC.update(Indexes); @@ -874,28 +837,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); + + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; - // Now instrument memop intrinsic calls. - FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -984,9 +955,9 @@ class PGOUseFunc { public: PGOUseFunc(Function &Func, Module *Modu, std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers, - BranchProbabilityInfo *BPI = nullptr, - BlockFrequencyInfo *BFIin = nullptr, bool IsCS = false) - : F(Func), M(Modu), BFI(BFIin), + BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin, + ProfileSummaryInfo *PSI, bool IsCS) + : F(Func), M(Modu), BFI(BFIin), PSI(PSI), FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS), FreqAttr(FFA_Normal), IsCS(IsCS) {} @@ -1041,6 +1012,7 @@ private: Function &F; Module *M; BlockFrequencyInfo *BFI; + ProfileSummaryInfo *PSI; // This member stores the shared information with class PGOGenFunc. 
FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo; @@ -1078,15 +1050,9 @@ private: // FIXME: This function should be removed once the functionality in // the inliner is implemented. void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) { - if (ProgramMaxCount == 0) - return; - // Threshold of the hot functions. - const BranchProbability HotFunctionThreshold(1, 100); - // Threshold of the cold functions. - const BranchProbability ColdFunctionThreshold(2, 10000); - if (EntryCount >= HotFunctionThreshold.scale(ProgramMaxCount)) + if (PSI->isHotCount(EntryCount)) FreqAttr = FFA_Hot; - else if (MaxCount <= ColdFunctionThreshold.scale(ProgramMaxCount)) + else if (PSI->isColdCount(MaxCount)) FreqAttr = FFA_Cold; } }; @@ -1433,43 +1399,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa<ConstantInt>(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast<ConstantInt>(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1482,11 +1411,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1505,11 +1429,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast<InstrProfValueKind>(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations : MaxNumAnnotations); @@ -1595,7 +1519,8 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M, static bool annotateAllFunctions( Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI, - function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) { + function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, + ProfileSummaryInfo *PSI, bool IsCS) { LLVM_DEBUG(dbgs() << "Read in profile counters: "); auto &Ctx = M.getContext(); // Read the counter array from file. 
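The computeCFGHash() hunk above only changes the scratch buffer from std::vector<char> to std::vector<uint8_t>, matching the byte-oriented JamCRC now declared in llvm/Support/CRC.h (see the include swap earlier in this diff); the encoding itself is untouched. A minimal sketch of that per-successor encoding, for illustration (the real code appends these bytes for every instrumented successor and then calls JC.update(Indexes)):

#include <cstdint>
#include <vector>

// Each successor's MST index is serialized as 4 little-endian bytes.
void appendIndexLE(std::vector<uint8_t> &Bytes, uint32_t Index) {
  for (int J = 0; J < 4; ++J)
    Bytes.push_back(static_cast<uint8_t>(Index >> (J * 8)));
}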
@@ -1626,6 +1551,13 @@ static bool annotateAllFunctions( return false; } + // Add the profile summary (read from the header of the indexed summary) here + // so that we can use it below when reading counters (which checks if the + // function should be marked with a cold or inlinehint attribute). + M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), + IsCS ? ProfileSummary::PSK_CSInstr + : ProfileSummary::PSK_Instr); + std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers; collectComdatMembers(M, ComdatMembers); std::vector<Function *> HotFunctions; @@ -1638,7 +1570,7 @@ static bool annotateAllFunctions( // Split indirectbr critical edges here before computing the MST rather than // later in getInstrBB() to avoid invalidating it. SplitIndirectBrCriticalEdges(F, BPI, BFI); - PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, IsCS); + PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS); bool AllZeros = false; if (!Func.readCounters(PGOReader.get(), AllZeros)) continue; @@ -1662,9 +1594,9 @@ static bool annotateAllFunctions( F.getName().equals(ViewBlockFreqFuncName))) { LoopInfo LI{DominatorTree(F)}; std::unique_ptr<BranchProbabilityInfo> NewBPI = - llvm::make_unique<BranchProbabilityInfo>(F, LI); + std::make_unique<BranchProbabilityInfo>(F, LI); std::unique_ptr<BlockFrequencyInfo> NewBFI = - llvm::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI); + std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI); if (PGOViewCounts == PGOVCT_Graph) NewBFI->view(); else if (PGOViewCounts == PGOVCT_Text) { @@ -1686,9 +1618,6 @@ static bool annotateAllFunctions( } } } - M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()), - IsCS ? ProfileSummary::PSK_CSInstr - : ProfileSummary::PSK_Instr); // Set function hotness attribute from the profile. 
// We have to apply these attributes at the end because their presence @@ -1730,8 +1659,10 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M, return &FAM.getResult<BlockFrequencyAnalysis>(F); }; + auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); + if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName, - LookupBPI, LookupBFI, IsCS)) + LookupBPI, LookupBFI, PSI, IsCS)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); @@ -1748,7 +1679,8 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) { return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI(); }; - return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, + auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI, IsCS); } @@ -1776,6 +1708,9 @@ void llvm::setProfMetadata(Module *M, Instruction *TI, : Weights) { dbgs() << W << " "; } dbgs() << "\n";); + + misexpect::verifyMisExpect(TI, Weights, TI->getContext()); + TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights)); if (EmitBranchProbability) { std::string BrCondStr = getBranchCondString(TI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index 188f95b4676b..d0afe2959b39 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -138,7 +139,7 @@ public: OptimizationRemarkEmitter &ORE, DominatorTree *DT) : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) { ValueDataArray = - llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); + std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2); // Get the MemOPSize range information from option MemOPSizeRange, getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart, PreciseRangeLast); @@ -374,8 +375,8 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) { Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB); Instruction *NewInst = MI->clone(); // Fix the argument. 
- MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst); - IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); + auto *MemI = cast<MemIntrinsic>(NewInst); + auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType()); assert(SizeType && "Expected integer type size argument."); ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId); MemI->setLength(CaseSizeId); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp index 81d92e724c7d..71ecfd9a2642 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp @@ -65,10 +65,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index ca0cb4bdbe84..e6dc684c2e77 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/EHPersonalities.h" @@ -31,6 +32,7 @@ #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -176,24 +178,21 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { return Options; } -class SanitizerCoverageModule : public ModulePass { +using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>; +using PostDomTreeCallback = + function_ref<const PostDominatorTree *(Function &F)>; + +class ModuleSanitizerCoverage { public: - SanitizerCoverageModule( + ModuleSanitizerCoverage( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) - : ModulePass(ID), Options(OverrideFromCL(Options)) { - initializeSanitizerCoverageModulePass(*PassRegistry::getPassRegistry()); - } - bool runOnModule(Module &M) override; - bool runOnFunction(Function &F); - static char ID; // Pass identification, replacement for typeid - StringRef getPassName() const override { return "SanitizerCoverageModule"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<PostDominatorTreeWrapperPass>(); - } + : Options(OverrideFromCL(Options)) {} + bool instrumentModule(Module &M, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); private: + void instrumentFunction(Function &F, DomTreeCallback DTCallback, + PostDomTreeCallback PDTCallback); void InjectCoverageForIndirectCalls(Function &F, ArrayRef<Instruction *> IndirCalls); void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets); @@ -252,10 +251,57 @@ private: SanitizerCoverageOptions 
Options; }; +class ModuleSanitizerCoverageLegacyPass : public ModulePass { +public: + ModuleSanitizerCoverageLegacyPass( + const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()) + : ModulePass(ID), Options(Options) { + initializeModuleSanitizerCoverageLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override { + ModuleSanitizerCoverage ModuleSancov(Options); + auto DTCallback = [this](Function &F) -> const DominatorTree * { + return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); + }; + auto PDTCallback = [this](Function &F) -> const PostDominatorTree * { + return &this->getAnalysis<PostDominatorTreeWrapperPass>(F) + .getPostDomTree(); + }; + return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback); + } + + static char ID; // Pass identification, replacement for typeid + StringRef getPassName() const override { return "ModuleSanitizerCoverage"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<PostDominatorTreeWrapperPass>(); + } + +private: + SanitizerCoverageOptions Options; +}; + } // namespace +PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M, + ModuleAnalysisManager &MAM) { + ModuleSanitizerCoverage ModuleSancov(Options); + auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); + auto DTCallback = [&FAM](Function &F) -> const DominatorTree * { + return &FAM.getResult<DominatorTreeAnalysis>(F); + }; + auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * { + return &FAM.getResult<PostDominatorTreeAnalysis>(F); + }; + if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback)) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + std::pair<Value *, Value *> -SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, +ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section, Type *Ty) { GlobalVariable *SecStart = new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage, nullptr, @@ -278,7 +324,7 @@ SanitizerCoverageModule::CreateSecStartEnd(Module &M, const char *Section, return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEndPtr); } -Function *SanitizerCoverageModule::CreateInitCallsForSections( +Function *ModuleSanitizerCoverage::CreateInitCallsForSections( Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty, const char *Section) { auto SecStartEnd = CreateSecStartEnd(M, Section, Ty); @@ -310,7 +356,8 @@ Function *SanitizerCoverageModule::CreateInitCallsForSections( return CtorFunc; } -bool SanitizerCoverageModule::runOnModule(Module &M) { +bool ModuleSanitizerCoverage::instrumentModule( + Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (Options.CoverageType == SanitizerCoverageOptions::SCK_None) return false; C = &(M.getContext()); @@ -403,7 +450,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) { M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy); for (auto &F : M) - runOnFunction(F); + instrumentFunction(F, DTCallback, PDTCallback); Function *Ctor = nullptr; @@ -518,29 +565,30 @@ static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT, return true; } -bool SanitizerCoverageModule::runOnFunction(Function &F) { +void ModuleSanitizerCoverage::instrumentFunction( + Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (F.empty()) - return false; + return; if (F.getName().find(".module_ctor") != 
std::string::npos) - return false; // Should not instrument sanitizer init functions. + return; // Should not instrument sanitizer init functions. if (F.getName().startswith("__sanitizer_")) - return false; // Don't instrument __sanitizer_* callbacks. + return; // Don't instrument __sanitizer_* callbacks. // Don't touch available_externally functions, their actual body is elewhere. if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) - return false; + return; // Don't instrument MSVC CRT configuration helpers. They may run before normal // initialization. if (F.getName() == "__local_stdio_printf_options" || F.getName() == "__local_stdio_scanf_options") - return false; + return; if (isa<UnreachableInst>(F.getEntryBlock().getTerminator())) - return false; + return; // Don't instrument functions using SEH for now. Splitting basic blocks like // we do for coverage breaks WinEHPrepare. // FIXME: Remove this when SEH no longer uses landingpad pattern matching. if (F.hasPersonalityFn() && isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) - return false; + return; if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge) SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests()); SmallVector<Instruction *, 8> IndirCalls; @@ -550,10 +598,8 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { SmallVector<BinaryOperator *, 8> DivTraceTargets; SmallVector<GetElementPtrInst *, 8> GepTraceTargets; - const DominatorTree *DT = - &getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); - const PostDominatorTree *PDT = - &getAnalysis<PostDominatorTreeWrapperPass>(F).getPostDomTree(); + const DominatorTree *DT = DTCallback(F); + const PostDominatorTree *PDT = PDTCallback(F); bool IsLeafFunc = true; for (auto &BB : F) { @@ -593,10 +639,9 @@ bool SanitizerCoverageModule::runOnFunction(Function &F) { InjectTraceForSwitch(F, SwitchTraceTargets); InjectTraceForDiv(F, DivTraceTargets); InjectTraceForGep(F, GepTraceTargets); - return true; } -GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( +GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection( size_t NumElements, Function &F, Type *Ty, const char *Section) { ArrayType *ArrayTy = ArrayType::get(Ty, NumElements); auto Array = new GlobalVariable( @@ -608,8 +653,9 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId)) Array->setComdat(Comdat); Array->setSection(getSectionName(Section)); - Array->setAlignment(Ty->isPointerTy() ? DL->getPointerSize() - : Ty->getPrimitiveSizeInBits() / 8); + Array->setAlignment(Align(Ty->isPointerTy() + ? 
DL->getPointerSize() + : Ty->getPrimitiveSizeInBits() / 8)); GlobalsToAppendToUsed.push_back(Array); GlobalsToAppendToCompilerUsed.push_back(Array); MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F)); @@ -619,7 +665,7 @@ GlobalVariable *SanitizerCoverageModule::CreateFunctionLocalArrayInSection( } GlobalVariable * -SanitizerCoverageModule::CreatePCArray(Function &F, +ModuleSanitizerCoverage::CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks) { size_t N = AllBlocks.size(); assert(N); @@ -646,7 +692,7 @@ SanitizerCoverageModule::CreatePCArray(Function &F, return PCArray; } -void SanitizerCoverageModule::CreateFunctionLocalArrays( +void ModuleSanitizerCoverage::CreateFunctionLocalArrays( Function &F, ArrayRef<BasicBlock *> AllBlocks) { if (Options.TracePCGuard) FunctionGuardArray = CreateFunctionLocalArrayInSection( @@ -660,7 +706,7 @@ void SanitizerCoverageModule::CreateFunctionLocalArrays( FunctionPCsArray = CreatePCArray(F, AllBlocks); } -bool SanitizerCoverageModule::InjectCoverage(Function &F, +bool ModuleSanitizerCoverage::InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks, bool IsLeafFunc) { if (AllBlocks.empty()) return false; @@ -677,7 +723,7 @@ bool SanitizerCoverageModule::InjectCoverage(Function &F, // The cache is used to speed up recording the caller-callee pairs. // The address of the caller is passed implicitly via caller PC. // CacheSize is encoded in the name of the run-time function. -void SanitizerCoverageModule::InjectCoverageForIndirectCalls( +void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls( Function &F, ArrayRef<Instruction *> IndirCalls) { if (IndirCalls.empty()) return; @@ -696,7 +742,7 @@ void SanitizerCoverageModule::InjectCoverageForIndirectCalls( // __sanitizer_cov_trace_switch(CondValue, // {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... }) -void SanitizerCoverageModule::InjectTraceForSwitch( +void ModuleSanitizerCoverage::InjectTraceForSwitch( Function &, ArrayRef<Instruction *> SwitchTraceTargets) { for (auto I : SwitchTraceTargets) { if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { @@ -735,7 +781,7 @@ void SanitizerCoverageModule::InjectTraceForSwitch( } } -void SanitizerCoverageModule::InjectTraceForDiv( +void ModuleSanitizerCoverage::InjectTraceForDiv( Function &, ArrayRef<BinaryOperator *> DivTraceTargets) { for (auto BO : DivTraceTargets) { IRBuilder<> IRB(BO); @@ -753,7 +799,7 @@ void SanitizerCoverageModule::InjectTraceForDiv( } } -void SanitizerCoverageModule::InjectTraceForGep( +void ModuleSanitizerCoverage::InjectTraceForGep( Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) { for (auto GEP : GepTraceTargets) { IRBuilder<> IRB(GEP); @@ -764,7 +810,7 @@ void SanitizerCoverageModule::InjectTraceForGep( } } -void SanitizerCoverageModule::InjectTraceForCmp( +void ModuleSanitizerCoverage::InjectTraceForCmp( Function &, ArrayRef<Instruction *> CmpTraceTargets) { for (auto I : CmpTraceTargets) { if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) { @@ -799,7 +845,7 @@ void SanitizerCoverageModule::InjectTraceForCmp( } } -void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, +void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, bool IsLeafFunc) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); @@ -842,8 +888,10 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
- Function *GetFrameAddr = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::frameaddress); + Module *M = F.getParent(); + Function *GetFrameAddr = Intrinsic::getDeclaration( + M, Intrinsic::frameaddress, + IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)}); auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy); @@ -858,7 +906,7 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB, } std::string -SanitizerCoverageModule::getSectionName(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionName(const std::string &Section) const { if (TargetTriple.isOSBinFormatCOFF()) { if (Section == SanCovCountersSectionName) return ".SCOV$CM"; @@ -872,32 +920,29 @@ SanitizerCoverageModule::getSectionName(const std::string &Section) const { } std::string -SanitizerCoverageModule::getSectionStart(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$start$__DATA$__" + Section; return "__start___" + Section; } std::string -SanitizerCoverageModule::getSectionEnd(const std::string &Section) const { +ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const { if (TargetTriple.isOSBinFormatMachO()) return "\1section$end$__DATA$__" + Section; return "__stop___" + Section; } - -char SanitizerCoverageModule::ID = 0; -INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." - "ModulePass", - false, false) +char ModuleSanitizerCoverageLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SanitizerCoverageModule, "sancov", - "SanitizerCoverage: TODO." 
- "ModulePass", - false, false) -ModulePass *llvm::createSanitizerCoverageModulePass( +INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov", + "Pass for instrumenting coverage on functions", false, + false) +ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass( const SanitizerCoverageOptions &Options) { - return new SanitizerCoverageModule(Options); + return new ModuleSanitizerCoverageLegacyPass(Options); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 5be13fa745cb..9b7edad3444b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -26,7 +26,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -37,6 +36,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -45,6 +45,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -92,11 +93,10 @@ namespace { /// ensures the __tsan_init function is in the list of global constructors for /// the module. struct ThreadSanitizer { - ThreadSanitizer(Module &M); bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI); private: - void initializeCallbacks(Module &M); + void initialize(Module &M); bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); bool instrumentAtomic(Instruction *I, const DataLayout &DL); bool instrumentMemIntrinsic(Instruction *I); @@ -108,8 +108,6 @@ private: void InsertRuntimeIgnores(Function &F); Type *IntptrTy; - IntegerType *OrdTy; - // Callbacks to run-time library are computed in doInitialization. FunctionCallee TsanFuncEntry; FunctionCallee TsanFuncExit; FunctionCallee TsanIgnoreBegin; @@ -130,7 +128,6 @@ private: FunctionCallee TsanVptrUpdate; FunctionCallee TsanVptrLoad; FunctionCallee MemmoveFn, MemcpyFn, MemsetFn; - Function *TsanCtorFunction; }; struct ThreadSanitizerLegacyPass : FunctionPass { @@ -143,16 +140,32 @@ struct ThreadSanitizerLegacyPass : FunctionPass { private: Optional<ThreadSanitizer> TSan; }; + +void insertModuleCtor(Module &M) { + getOrCreateSanitizerCtorAndInitFunctions( + M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, + /*InitArgs=*/{}, + // This callback is invoked when the functions are created the first + // time. 
Hook them into the global ctors list in that case: + [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); }); +} + } // namespace PreservedAnalyses ThreadSanitizerPass::run(Function &F, FunctionAnalysisManager &FAM) { - ThreadSanitizer TSan(*F.getParent()); + ThreadSanitizer TSan; if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F))) return PreservedAnalyses::none(); return PreservedAnalyses::all(); } +PreservedAnalyses ThreadSanitizerPass::run(Module &M, + ModuleAnalysisManager &MAM) { + insertModuleCtor(M); + return PreservedAnalyses::none(); +} + char ThreadSanitizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan", "ThreadSanitizer: detects data races.", false, false) @@ -169,12 +182,13 @@ void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { } bool ThreadSanitizerLegacyPass::doInitialization(Module &M) { - TSan.emplace(M); + insertModuleCtor(M); + TSan.emplace(); return true; } bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); TSan->sanitizeFunction(F, TLI); return true; } @@ -183,7 +197,10 @@ FunctionPass *llvm::createThreadSanitizerLegacyPassPass() { return new ThreadSanitizerLegacyPass(); } -void ThreadSanitizer::initializeCallbacks(Module &M) { +void ThreadSanitizer::initialize(Module &M) { + const DataLayout &DL = M.getDataLayout(); + IntptrTy = DL.getIntPtrType(M.getContext()); + IRBuilder<> IRB(M.getContext()); AttributeList Attr; Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex, @@ -197,7 +214,7 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getVoidTy()); TsanIgnoreEnd = M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy()); - OrdTy = IRB.getInt32Ty(); + IntegerType *OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { const unsigned ByteSize = 1U << i; const unsigned BitSize = ByteSize * 8; @@ -280,20 +297,6 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy); } -ThreadSanitizer::ThreadSanitizer(Module &M) { - const DataLayout &DL = M.getDataLayout(); - IntptrTy = DL.getIntPtrType(M.getContext()); - std::tie(TsanCtorFunction, std::ignore) = - getOrCreateSanitizerCtorAndInitFunctions( - M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{}, - /*InitArgs=*/{}, - // This callback is invoked when the functions are created the first - // time. Hook them into the global ctors list in that case: - [&](Function *Ctor, FunctionCallee) { - appendToGlobalCtors(M, Ctor, 0); - }); -} - static bool isVtableAccess(Instruction *I) { if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) return Tag->isTBAAVtableAccess(); @@ -436,9 +439,9 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, const TargetLibraryInfo &TLI) { // This is required to prevent instrumenting call to __tsan_init from within // the module constructor. 
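+  // (The ctor is now recognized by name: its creation moved into
+  // insertModuleCtor, so there is no cached Function* left to compare
+  // against, and getOrCreateSanitizerCtorAndInitFunctions always creates the
+  // ctor under kTsanModuleCtorName.)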
- if (&F == TsanCtorFunction) + if (F.getName() == kTsanModuleCtorName) return false; - initializeCallbacks(*F.getParent()); + initialize(*F.getParent()); SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; SmallVector<Instruction*, 8> AtomicAccesses; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include <cassert> + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector<CandidateInfo> &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template <class... Ts> class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {} +}; + +template <class PluginT, class... Ts> +class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> { + PluginT Plugin; + using Base = PluginChain<Ts...>; + +public: + PluginChain(Function &F) : PluginChain<Ts...>(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. 
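+/// (A minimal sketch of the dispatch, assuming hypothetical plugins A and B
+/// in VP_PLUGIN_LIST: PluginChain<A, B>::get(K, Cs) runs the A plugin's
+/// run(Cs) when K == A::Kind, then forwards to PluginChain<B>::get(K, Cs),
+/// bottoming out at the empty PluginChain<> specialization above.)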
+class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
+public:
+  using PluginChainFinal::PluginChainFinal;
+};
+
+ValueProfileCollector::ValueProfileCollector(Function &F)
+    : PImpl(new ValueProfileCollectorImpl(F)) {}
+
+ValueProfileCollector::~ValueProfileCollector() = default;
+
+std::vector<CandidateInfo>
+ValueProfileCollector::get(InstrProfValueKind Kind) const {
+  std::vector<CandidateInfo> Result;
+  PImpl->get(Kind, Result);
+  return Result;
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
new file mode 100644
index 000000000000..ff883c8d0c77
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -0,0 +1,79 @@
+//===- ValueProfileCollector.h - determine what to value profile ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a utility class, ValueProfileCollector, that is used to
+// determine what kind of llvm::Value's are worth value-profiling, at which
+// point in the program, and which instruction holds the Value Profile metadata.
+// Currently, the only users of this utility are the PGOInstrumentation[Gen|Use]
+// passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+
+namespace llvm {
+
+/// Utility analysis that determines what values are worth profiling.
+/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
+/// populate the Candidates vector.
+///
+/// Value profiling an expression means tracking the values that this
+/// expression takes at runtime and the frequency of each value.
+/// It is important to distinguish between two sets of value profiles for a
+/// particular expression:
+/// 1) The set of values at the point of evaluation.
+/// 2) The set of values at the point of use.
+/// In some cases, the two sets are identical, but it's not unusual for the two
+/// to differ.
+///
+/// To elaborate, consider this C code, and focus on the expression `nn`:
+///   void foo(int nn, bool b) {
+///     if (b) memcpy(x, y, nn);
+///   }
+/// The point of evaluation can be as early as the start of the function, and
+/// let's say the value profile for `nn` is:
+///   total=100; (value,freq) set = {(8,10), (32,50)}
+/// The point of use is right before we call memcpy, and since we execute the
+/// memcpy conditionally, the value profile of `nn` can be:
+///   total=15; (value,freq) set = {(8,10), (4,5)}
+///
+/// For this reason, a plugin is responsible for computing the insertion point
+/// for each value to be profiled. The `CandidateInfo` structure encapsulates
+/// all the information needed for each value profile site.
+class ValueProfileCollector {
+public:
+  struct CandidateInfo {
+    Value *V;                   // The value to profile.
+    Instruction *InsertPt;      // Insert the VP lib call before this instr.
+    Instruction *AnnotatedInst; // Where metadata is attached.
+  };
+
+  ValueProfileCollector(Function &Fn);
+  ValueProfileCollector(ValueProfileCollector &&) = delete;
+  ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
+
+  ValueProfileCollector(const ValueProfileCollector &) = delete;
+  ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
+  ~ValueProfileCollector();
+
+  /// Returns a list of value profiling candidates of the given kind.
+  std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
+
+private:
+  class ValueProfileCollectorImpl;
+  std::unique_ptr<ValueProfileCollectorImpl> PImpl;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
new file mode 100644
index 000000000000..4cc4c6c848c3
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -0,0 +1,75 @@
+//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
+// Each plugin is responsible for collecting Value Profiling candidates for a
+// particular optimization.
+// Each plugin must satisfy the interface described in ValueProfileCollector.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfileCollector.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+using CandidateInfo = ValueProfileCollector::CandidateInfo;
+
+///--------------------------- MemIntrinsicPlugin ------------------------------
+class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
+  Function &F;
+  std::vector<CandidateInfo> *Candidates;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
+
+  MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {}
+
+  void run(std::vector<CandidateInfo> &Cs) {
+    Candidates = &Cs;
+    visit(F);
+    Candidates = nullptr;
+  }
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    Value *Length = MI.getLength();
+    // Do not instrument constant-length calls.
+    if (isa<ConstantInt>(Length))
+      return;
+
+    Instruction *InsertPt = &MI;
+    Instruction *AnnotatedInst = &MI;
+    Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+  }
+};
+
+///------------------------ IndirectCallPromotionPlugin ------------------------
+class IndirectCallPromotionPlugin {
+  Function &F;
+
+public:
+  static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
+
+  IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {}
+
+  void run(std::vector<CandidateInfo> &Candidates) {
+    std::vector<Instruction *> Result = findIndirectCalls(F);
+    for (Instruction *I : Result) {
+      Value *Callee = CallSite(I).getCalledValue();
+      Instruction *InsertPt = I;
+      Instruction *AnnotatedInst = I;
+      Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
+    }
+  }
+};
+
+///----------------------- Registration of the plugins -------------------------
+/// For now, registering a plugin with the ValueProfileCollector is done by
+/// adding the plugin type to the VP_PLUGIN_LIST macro.
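+/// (Note: a hypothetical third plugin would simply be appended to the list
+/// below; per the interface described in ValueProfileCollector.cpp it only
+/// needs a static constexpr InstrProfValueKind Kind, a constructor taking
+/// Function &, and void run(std::vector<CandidateInfo> &).)
+///
+/// A pass then queries the candidates through the collector, e.g.:
+///   ValueProfileCollector VPC(F);
+///   std::vector<CandidateInfo> MemOpSites = VPC.get(IPVK_MemOPSize);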
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp index b341dd807508..7a01ec967fb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp @@ -26,6 +26,7 @@ #include "ObjCARC.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Constants.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 36aa513ec554..ecf8220ae95d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -34,6 +34,8 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp index 04e98d8f5577..205d8ddf151d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/PassAnalysisSupport.h" #include "llvm/PassRegistry.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 6653ff0bb91a..b80c1675050b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -58,8 +58,10 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -506,6 +508,20 @@ namespace { ARCInstKind &Class); void OptimizeIndividualCalls(Function &F); + /// Optimize an individual call, optionally passing the + /// GetArgRCIdentityRoot if it has already been computed. + void OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg); + + /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the + /// optimization occurs, returns true to indicate that the caller should + /// assume the instructions are dead. 
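+  /// (Illustrative IR shape this targets, assuming the callee has been
+  /// inlined so the pair ends up in a single basic block:
+  ///   %v = call i8* @llvm.objc.autoreleaseReturnValue(i8* %x)
+  ///   %w = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %v)
+  /// The pair cancels out and both calls are erased; for ClaimRV only the
+  /// retain half cancels, leaving a plain objc_release in its place.)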
+ bool OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg); + void CheckForCFGHazards(const BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, BBState &MyStates) const; @@ -589,8 +605,7 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const { } /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is -/// not a return value. Or, if it can be paired with an -/// objc_autoreleaseReturnValue, delete the pair and return true. +/// not a return value. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. @@ -616,39 +631,6 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { } } - // Track PHIs which are equivalent to our Arg. - SmallDenseSet<const Value*, 2> EquivalentArgs; - EquivalentArgs.insert(Arg); - - // Add PHIs that are equivalent to Arg to ArgUsers. - if (const PHINode *PN = dyn_cast<PHINode>(Arg)) { - SmallVector<const Value *, 2> ArgUsers; - getEquivalentPHIs(*PN, ArgUsers); - EquivalentArgs.insert(ArgUsers.begin(), ArgUsers.end()); - } - - // Check for being preceded by an objc_autoreleaseReturnValue on the same - // pointer. In this case, we can delete the pair. - BasicBlock::iterator I = RetainRV->getIterator(), - Begin = RetainRV->getParent()->begin(); - if (I != Begin) { - do - --I; - while (I != Begin && IsNoopInstruction(&*I)); - if (GetBasicARCInstKind(&*I) == ARCInstKind::AutoreleaseRV && - EquivalentArgs.count(GetArgRCIdentityRoot(&*I))) { - Changed = true; - ++NumPeeps; - - LLVM_DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" - << "Erasing " << *RetainRV << "\n"); - - EraseInstruction(&*I); - EraseInstruction(RetainRV); - return true; - } - } - // Turn it to a plain objc_retain. Changed = true; ++NumPeeps; @@ -666,6 +648,62 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { return false; } +bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, const Value *&Arg, ARCInstKind Class, + Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) { + // Must be in the same basic block. + assert(Inst->getParent() == AutoreleaseRV->getParent()); + + // Must operate on the same root. + Arg = GetArgRCIdentityRoot(Inst); + AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV); + if (Arg != AutoreleaseRVArg) { + // If there isn't an exact match, check if we have equivalent PHIs. + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) + return false; + + SmallVector<const Value *, 4> ArgUsers; + getEquivalentPHIs(*PN, ArgUsers); + if (llvm::find(ArgUsers, AutoreleaseRVArg) == ArgUsers.end()) + return false; + } + + // Okay, this is a match. Merge them. + ++NumPeeps; + LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '" + << *AutoreleaseRV << "' paired with '" << *Inst << "'\n"); + + // Delete the RV pair, starting with the AutoreleaseRV. + AutoreleaseRV->replaceAllUsesWith( + cast<CallInst>(AutoreleaseRV)->getArgOperand(0)); + EraseInstruction(AutoreleaseRV); + if (Class == ARCInstKind::RetainRV) { + // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV. 
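+    // (These intrinsics return their operand unchanged, so uses of the
+    // RetainRV result can be rewritten to the original argument directly.)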
+ Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0)); + EraseInstruction(Inst); + return true; + } + + // ClaimRV is a frontend peephole for RetainRV + Release. Since the + // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release. + assert(Class == ARCInstKind::ClaimRV); + Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0); + CallInst *Release = CallInst::Create( + EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst); + assert(IsAlwaysTail(ARCInstKind::ClaimRV) && + "Expected ClaimRV to be safe to tail call"); + Release->setTailCall(); + Inst->replaceAllUsesWith(CallArg); + EraseInstruction(Inst); + + // Run the normal optimizations on Release. + OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release, + Arg); + return true; +} + /// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not /// used as a return value. void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, @@ -752,286 +790,370 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) BlockColors = colorEHFunclets(F); + // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired + // with RetainRV and ClaimRV. + Instruction *DelayedAutoreleaseRV = nullptr; + const Value *DelayedAutoreleaseRVArg = nullptr; + auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) { + assert(!DelayedAutoreleaseRV || !AutoreleaseRV); + DelayedAutoreleaseRV = AutoreleaseRV; + DelayedAutoreleaseRVArg = nullptr; + }; + auto optimizeDelayedAutoreleaseRV = [&]() { + if (!DelayedAutoreleaseRV) + return; + OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV, + ARCInstKind::AutoreleaseRV, + DelayedAutoreleaseRVArg); + setDelayedAutoreleaseRV(nullptr); + }; + auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) { + // Nothing to delay, but we may as well skip the logic below. + if (!DelayedAutoreleaseRV) + return true; + + // If we hit the end of the basic block we're not going to find an RV-pair. + // Stop delaying. + if (NonARCInst->isTerminator()) + return false; + + // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and + // ClaimRV, it's probably safe to skip over even opaque function calls + // here since OptimizeInlinedAutoreleaseRVCall will confirm that they + // have the same RCIdentityRoot. However, what really matters is + // skipping instructions or intrinsics that the inliner could leave behind; + // be conservative for now and don't skip over opaque calls, which could + // potentially include other ARC calls. + auto *CB = dyn_cast<CallBase>(NonARCInst); + if (!CB) + return true; + return CB->getIntrinsicID() != Intrinsic::not_intrinsic; + }; + // Visit all objc_* calls in F. for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; ARCInstKind Class = GetBasicARCInstKind(Inst); - LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); - - // Some of the ARC calls can be deleted if their arguments are global - // variables that are inert in ARC. - if (IsNoopOnGlobal(Class)) { - Value *Opnd = Inst->getOperand(0); - if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) - if (GV->hasAttribute("objc_arc_inert")) { - if (!Inst->getType()->isVoidTy()) - Inst->replaceAllUsesWith(Opnd); - Inst->eraseFromParent(); - continue; - } - } - + // Skip this loop if this instruction isn't itself an ARC intrinsic. 
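+    // (While an AutoreleaseRV is pending, shouldDelayAutoreleaseRV above keeps
+    // the delay alive across non-call instructions and intrinsic calls, e.g.
+    // inliner-inserted llvm.lifetime markers, but conservatively flushes it at
+    // opaque calls and at block terminators.)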
+ const Value *Arg = nullptr; switch (Class) { - default: break; - - // Delete no-op casts. These function calls have special semantics, but - // the semantics are entirely implemented via lowering in the front-end, - // so by the time they reach the optimizer, they are just no-op calls - // which return their argument. - // - // There are gray areas here, as the ability to cast reference-counted - // pointers to raw void* and back allows code to break ARC assumptions, - // however these are currently considered to be unimportant. - case ARCInstKind::NoopCast: - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); - EraseInstruction(Inst); - continue; - - // If the pointer-to-weak-pointer is null, it's undefined behavior. - case ARCInstKind::StoreWeak: - case ARCInstKind::LoadWeak: - case ARCInstKind::LoadWeakRetained: - case ARCInstKind::InitWeak: - case ARCInstKind::DestroyWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::CopyWeak: - case ARCInstKind::MoveWeak: { - CallInst *CI = cast<CallInst>(Inst); - if (IsNullOrUndef(CI->getArgOperand(0)) || - IsNullOrUndef(CI->getArgOperand(1))) { - Changed = true; - Type *Ty = CI->getArgOperand(0)->getType(); - new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), - Constant::getNullValue(Ty), - CI); - - Value *NewValue = UndefValue::get(CI->getType()); - LLVM_DEBUG( - dbgs() << "A null pointer-to-weak-pointer is undefined behavior." - "\nOld = " - << *CI << "\nNew = " << *NewValue << "\n"); - - CI->replaceAllUsesWith(NewValue); - CI->eraseFromParent(); - continue; - } - break; - } - case ARCInstKind::RetainRV: - if (OptimizeRetainRVCall(F, Inst)) - continue; + default: + optimizeDelayedAutoreleaseRV(); break; + case ARCInstKind::CallOrUser: + case ARCInstKind::User: + case ARCInstKind::None: + // This is a non-ARC instruction. If we're delaying an AutoreleaseRV, + // check if it's safe to skip over it; if not, optimize the AutoreleaseRV + // now. + if (!shouldDelayAutoreleaseRV(Inst)) + optimizeDelayedAutoreleaseRV(); + continue; case ARCInstKind::AutoreleaseRV: - OptimizeAutoreleaseRVCall(F, Inst, Class); + optimizeDelayedAutoreleaseRV(); + setDelayedAutoreleaseRV(Inst); + continue; + case ARCInstKind::RetainRV: + case ARCInstKind::ClaimRV: + if (DelayedAutoreleaseRV) { + // We have a potential RV pair. Check if they cancel out. + if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class, + DelayedAutoreleaseRV, + DelayedAutoreleaseRVArg)) { + setDelayedAutoreleaseRV(nullptr); + continue; + } + optimizeDelayedAutoreleaseRV(); + } break; } - // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. - if (IsAutorelease(Class) && Inst->use_empty()) { - CallInst *Call = cast<CallInst>(Inst); - const Value *Arg = Call->getArgOperand(0); - Arg = FindSingleUseIdentifiedObject(Arg); - if (Arg) { - Changed = true; - ++NumAutoreleases; - - // Create the declaration lazily. 
- LLVMContext &C = Inst->getContext(); - - Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); - CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "", - Call); - NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), - MDNode::get(C, None)); - - LLVM_DEBUG( - dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " - "since x is otherwise unused.\nOld: " - << *Call << "\nNew: " << *NewCall << "\n"); - - EraseInstruction(Call); - Inst = NewCall; - Class = ARCInstKind::Release; + OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg); + } + + // Catch the final delayed AutoreleaseRV. + optimizeDelayedAutoreleaseRV(); +} + +void ObjCARCOpt::OptimizeIndividualCallImpl( + Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors, + Instruction *Inst, ARCInstKind Class, const Value *Arg) { + LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); + + // Some of the ARC calls can be deleted if their arguments are global + // variables that are inert in ARC. + if (IsNoopOnGlobal(Class)) { + Value *Opnd = Inst->getOperand(0); + if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts())) + if (GV->hasAttribute("objc_arc_inert")) { + if (!Inst->getType()->isVoidTy()) + Inst->replaceAllUsesWith(Opnd); + Inst->eraseFromParent(); + return; } - } + } - // For functions which can never be passed stack arguments, add - // a tail keyword. - if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + switch (Class) { + default: + break; + + // Delete no-op casts. These function calls have special semantics, but + // the semantics are entirely implemented via lowering in the front-end, + // so by the time they reach the optimizer, they are just no-op calls + // which return their argument. + // + // There are gray areas here, as the ability to cast reference-counted + // pointers to raw void* and back allows code to break ARC assumptions, + // however these are currently considered to be unimportant. + case ARCInstKind::NoopCast: + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); + EraseInstruction(Inst); + return; + + // If the pointer-to-weak-pointer is null, it's undefined behavior. + case ARCInstKind::StoreWeak: + case ARCInstKind::LoadWeak: + case ARCInstKind::LoadWeakRetained: + case ARCInstKind::InitWeak: + case ARCInstKind::DestroyWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + Value *NewValue = UndefValue::get(CI->getType()); LLVM_DEBUG( - dbgs() << "Adding tail keyword to function since it can never be " - "passed stack args: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setTailCall(); + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } - - // Ensure that functions that can never have a "tail" keyword due to the - // semantics of ARC truly do not do so. 
- if (IsNeverTail(Class)) { + break; + } + case ARCInstKind::CopyWeak: + case ARCInstKind::MoveWeak: { + CallInst *CI = cast<CallInst>(Inst); + if (IsNullOrUndef(CI->getArgOperand(0)) || + IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; - LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst - << "\n"); - cast<CallInst>(Inst)->setTailCall(false); + Type *Ty = CI->getArgOperand(0)->getType(); + new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), + Constant::getNullValue(Ty), CI); + + Value *NewValue = UndefValue::get(CI->getType()); + LLVM_DEBUG( + dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " + << *CI << "\nNew = " << *NewValue << "\n"); + + CI->replaceAllUsesWith(NewValue); + CI->eraseFromParent(); + return; } + break; + } + case ARCInstKind::RetainRV: + if (OptimizeRetainRVCall(F, Inst)) + return; + break; + case ARCInstKind::AutoreleaseRV: + OptimizeAutoreleaseRVCall(F, Inst, Class); + break; + } - // Set nounwind as needed. - if (IsNoThrow(Class)) { + // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. + if (IsAutorelease(Class) && Inst->use_empty()) { + CallInst *Call = cast<CallInst>(Inst); + const Value *Arg = Call->getArgOperand(0); + Arg = FindSingleUseIdentifiedObject(Arg); + if (Arg) { Changed = true; - LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " - << *Inst << "\n"); - cast<CallInst>(Inst)->setDoesNotThrow(); - } + ++NumAutoreleases; - if (!IsNoopOnNull(Class)) { - UsedInThisFunction |= 1 << unsigned(Class); - continue; - } + // Create the declaration lazily. + LLVMContext &C = Inst->getContext(); - const Value *Arg = GetArgRCIdentityRoot(Inst); + Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release); + CallInst *NewCall = + CallInst::Create(Decl, Call->getArgOperand(0), "", Call); + NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), + MDNode::get(C, None)); - // ARC calls with null are no-ops. Delete them. - if (IsNullOrUndef(Arg)) { - Changed = true; - ++NumNoops; - LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst - << "\n"); - EraseInstruction(Inst); - continue; + LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " + "since x is otherwise unused.\nOld: " + << *Call << "\nNew: " << *NewCall << "\n"); + + EraseInstruction(Call); + Inst = NewCall; + Class = ARCInstKind::Release; } + } + + // For functions which can never be passed stack arguments, add + // a tail keyword. + if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) { + Changed = true; + LLVM_DEBUG( + dbgs() << "Adding tail keyword to function since it can never be " + "passed stack args: " + << *Inst << "\n"); + cast<CallInst>(Inst)->setTailCall(); + } + + // Ensure that functions that can never have a "tail" keyword due to the + // semantics of ARC truly do not do so. + if (IsNeverTail(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setTailCall(false); + } + + // Set nounwind as needed. + if (IsNoThrow(Class)) { + Changed = true; + LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst + << "\n"); + cast<CallInst>(Inst)->setDoesNotThrow(); + } - // Keep track of which of retain, release, autorelease, and retain_block - // are actually present in this function. + // Note: This catches instructions unrelated to ARC. 
+ if (!IsNoopOnNull(Class)) { UsedInThisFunction |= 1 << unsigned(Class); + return; + } + + // If we haven't already looked up the root, look it up now. + if (!Arg) + Arg = GetArgRCIdentityRoot(Inst); + + // ARC calls with null are no-ops. Delete them. + if (IsNullOrUndef(Arg)) { + Changed = true; + ++NumNoops; + LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst + << "\n"); + EraseInstruction(Inst); + return; + } + + // Keep track of which of retain, release, autorelease, and retain_block + // are actually present in this function. + UsedInThisFunction |= 1 << unsigned(Class); + + // If Arg is a PHI, and one or more incoming values to the + // PHI are null, and the call is control-equivalent to the PHI, and there + // are no relevant side effects between the PHI and the call, and the call + // is not a release that doesn't have the clang.imprecise_release tag, the + // call could be pushed up to just those paths with non-null incoming + // values. For now, don't bother splitting critical edges for this. + if (Class == ARCInstKind::Release && + !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + return; + + SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; + Worklist.push_back(std::make_pair(Inst, Arg)); + do { + std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); + Inst = Pair.first; + Arg = Pair.second; - // If Arg is a PHI, and one or more incoming values to the - // PHI are null, and the call is control-equivalent to the PHI, and there - // are no relevant side effects between the PHI and the call, and the call - // is not a release that doesn't have the clang.imprecise_release tag, the - // call could be pushed up to just those paths with non-null incoming - // values. For now, don't bother splitting critical edges for this. - if (Class == ARCInstKind::Release && - !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease))) + const PHINode *PN = dyn_cast<PHINode>(Arg); + if (!PN) continue; - SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist; - Worklist.push_back(std::make_pair(Inst, Arg)); - do { - std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val(); - Inst = Pair.first; - Arg = Pair.second; - - const PHINode *PN = dyn_cast<PHINode>(Arg); - if (!PN) continue; - - // Determine if the PHI has any null operands, or any incoming - // critical edges. - bool HasNull = false; - bool HasCriticalEdges = false; - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (IsNullOrUndef(Incoming)) - HasNull = true; - else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != - 1) { - HasCriticalEdges = true; - break; - } + // Determine if the PHI has any null operands, or any incoming + // critical edges. + bool HasNull = false; + bool HasCriticalEdges = false; + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + HasNull = true; + else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() != + 1) { + HasCriticalEdges = true; + break; } - // If we have null operands and no critical edges, optimize. - if (!HasCriticalEdges && HasNull) { - SmallPtrSet<Instruction *, 4> DependingInstructions; - SmallPtrSet<const BasicBlock *, 4> Visited; - - // Check that there is nothing that cares about the reference - // count between the call and the phi. 
- switch (Class) { - case ARCInstKind::Retain: - case ARCInstKind::RetainBlock: - // These can always be moved up. - break; - case ARCInstKind::Release: - // These can't be moved across things that care about the retain - // count. - FindDependencies(NeedsPositiveRetainCount, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::Autorelease: - // These can't be moved across autorelease pool scope boundaries. - FindDependencies(AutoreleasePoolBoundary, Arg, - Inst->getParent(), Inst, - DependingInstructions, Visited, PA); - break; - case ARCInstKind::ClaimRV: - case ARCInstKind::RetainRV: - case ARCInstKind::AutoreleaseRV: - // Don't move these; the RV optimization depends on the autoreleaseRV - // being tail called, and the retainRV being immediately after a call - // (which might still happen if we get lucky with codegen layout, but - // it's not worth taking the chance). - continue; - default: - llvm_unreachable("Invalid dependence flavor"); - } + } + // If we have null operands and no critical edges, optimize. + if (HasCriticalEdges) + continue; + if (!HasNull) + continue; - if (DependingInstructions.size() == 1 && - *DependingInstructions.begin() == PN) { - Changed = true; - ++NumPartialNoops; - // Clone the call into each predecessor that has a non-null value. - CallInst *CInst = cast<CallInst>(Inst); - Type *ParamTy = CInst->getArgOperand(0)->getType(); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = - GetRCIdentityRoot(PN->getIncomingValue(i)); - if (!IsNullOrUndef(Incoming)) { - Value *Op = PN->getIncomingValue(i); - Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); - CallInst *Clone = cast<CallInst>(CloneCallInstForBB( - *CInst, *InsertPos->getParent(), BlockColors)); - if (Op->getType() != ParamTy) - Op = new BitCastInst(Op, ParamTy, "", InsertPos); - Clone->setArgOperand(0, Op); - Clone->insertBefore(InsertPos); - - LLVM_DEBUG(dbgs() << "Cloning " << *CInst - << "\n" - "And inserting clone at " - << *InsertPos << "\n"); - Worklist.push_back(std::make_pair(Clone, Incoming)); - } - } - // Erase the original call. - LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); - EraseInstruction(CInst); - continue; - } - } - } while (!Worklist.empty()); - } + SmallPtrSet<Instruction *, 4> DependingInstructions; + SmallPtrSet<const BasicBlock *, 4> Visited; + + // Check that there is nothing that cares about the reference + // count between the call and the phi. + switch (Class) { + case ARCInstKind::Retain: + case ARCInstKind::RetainBlock: + // These can always be moved up. + break; + case ARCInstKind::Release: + // These can't be moved across things that care about the retain + // count. + FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::Autorelease: + // These can't be moved across autorelease pool scope boundaries. + FindDependencies(AutoreleasePoolBoundary, Arg, Inst->getParent(), Inst, + DependingInstructions, Visited, PA); + break; + case ARCInstKind::ClaimRV: + case ARCInstKind::RetainRV: + case ARCInstKind::AutoreleaseRV: + // Don't move these; the RV optimization depends on the autoreleaseRV + // being tail called, and the retainRV being immediately after a call + // (which might still happen if we get lucky with codegen layout, but + // it's not worth taking the chance). 
+ continue; + default: + llvm_unreachable("Invalid dependence flavor"); + } + + if (DependingInstructions.size() != 1) + continue; + if (*DependingInstructions.begin() != PN) + continue; + + Changed = true; + ++NumPartialNoops; + // Clone the call into each predecessor that has a non-null value. + CallInst *CInst = cast<CallInst>(Inst); + Type *ParamTy = CInst->getArgOperand(0)->getType(); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i)); + if (IsNullOrUndef(Incoming)) + continue; + Value *Op = PN->getIncomingValue(i); + Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); + CallInst *Clone = cast<CallInst>( + CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors)); + if (Op->getType() != ParamTy) + Op = new BitCastInst(Op, ParamTy, "", InsertPos); + Clone->setArgOperand(0, Op); + Clone->insertBefore(InsertPos); + + LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n" + "And inserting clone at " + << *InsertPos << "\n"); + Worklist.push_back(std::make_pair(Clone, Incoming)); + } + // Erase the original call. + LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n"); + EraseInstruction(CInst); + } while (!Worklist.empty()); } /// If we have a top down pointer in the S_Use state, make sure that there are diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp index b768f7973b87..99a2055aba94 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp index 3243481dee0d..26dd416d6184 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/ObjCARC/PtrState.cpp @@ -275,6 +275,10 @@ void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst, } else { InsertAfter = std::next(Inst->getIterator()); } + + if (InsertAfter != BB->end()) + InsertAfter = skipDebugIntrinsics(InsertAfter); + InsertReverseInsertPt(&*InsertAfter); }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp index 7f7460c5746a..cc3d3bf7cdbf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ProfileData/InstrProf.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index de9a62e88c27..06deaf3c4f9a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" 
#define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" @@ -93,9 +94,7 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV, const SCEV *AlignSCEV, ScalarEvolution *SE) { // DiffUnits = Diff % int64_t(Alignment) - const SCEV *DiffAlignDiv = SE->getUDivExpr(DiffSCEV, AlignSCEV); - const SCEV *DiffAlign = SE->getMulExpr(DiffAlignDiv, AlignSCEV); - const SCEV *DiffUnitsSCEV = SE->getMinusSCEV(DiffAlign, DiffSCEV); + const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV); LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is " << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n"); @@ -323,7 +322,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { LI->getPointerOperand(), SE); if (NewAlignment > LI->getAlignment()) { - LI->setAlignment(NewAlignment); + LI->setAlignment(MaybeAlign(NewAlignment)); ++NumLoadAlignChanged; } } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) { @@ -331,7 +330,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp index 9bd387c33e80..0fa38fa80b17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -19,13 +19,14 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "bdce" @@ -101,7 +102,7 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { (I.getType()->isIntOrIntVectorTy() && DB.getDemandedBits(&I).isNullValue() && wouldInstructionBeTriviallyDead(&I))) { - salvageDebugInfo(I); + salvageDebugInfoOrMarkUndef(I); Worklist.push_back(&I); I.dropAllReferences(); Changed = true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp index 3519b000a33f..e34c011b1c87 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp @@ -59,13 +59,15 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace PatternMatch; @@ -562,7 +564,7 @@ struct CallSiteSplittingLegacyPass : public FunctionPass { if (skipFunction(F)) return false; - auto &TLI = 
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); return doCallSiteSplitting(F, TLI, TTI, DT); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 98243a23f1ef..5bfece010bec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -14,7 +14,7 @@ // cost. If the constant can be folded into the instruction (the cost is // TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't // consider it expensive and leave it alone. This is the default behavior and -// the default implementation of getIntImmCost will always return TCC_Free. +// the default implementation of getIntImmCostInst will always return TCC_Free. // // If the cost is more than TCC_BASIC, then the integer constant can't be folded // into the instruction and it might be beneficial to hoist the constant. @@ -43,7 +43,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -54,6 +53,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/Casting.h" @@ -61,6 +61,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include <algorithm> #include <cassert> @@ -204,7 +205,7 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// set found in \p BBs. static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Entry, - SmallPtrSet<BasicBlock *, 8> &BBs) { + SetVector<BasicBlock *> &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. SmallPtrSet<BasicBlock *, 8> Path; @@ -257,7 +258,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, // Visit Orders in bottom-up order. using InsertPtsCostPair = - std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency>; + std::pair<SetVector<BasicBlock *>, BlockFrequency>; // InsertPtsMap is a map from a BB to the best insertion points for the // subtree of BB (subtree not including the BB itself). @@ -266,7 +267,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { BasicBlock *Node = *RIt; bool NodeInBBs = BBs.count(Node); - SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first; + auto &InsertPts = InsertPtsMap[Node].first; BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; // Return the optimal insert points in BBs. @@ -283,7 +284,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. 
Every child // will update its parent's ParentInsertPts and ParentPtsFreq. - SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first; + auto &ParentInsertPts = InsertPtsMap[Parent].first; BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; // Choose to insert in Node or in subtree of Node. // Don't hoist to EHPad because we may not find a proper place to insert @@ -305,12 +306,12 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, } /// Find an insertion point that dominates all uses. -SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( +SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. - SmallPtrSet<BasicBlock *, 8> BBs; - SmallPtrSet<Instruction *, 8> InsertPts; + SetVector<BasicBlock *> BBs; + SetVector<Instruction *> InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); @@ -333,15 +334,13 @@ SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; - BB1 = *BBs.begin(); - BB2 = *std::next(BBs.begin()); + BB1 = BBs.pop_back_val(); + BB2 = BBs.pop_back_val(); BB = DT->findNearestCommonDominator(BB1, BB2); if (BB == Entry) { InsertPts.insert(&Entry->front()); return InsertPts; } - BBs.erase(BB1); - BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); @@ -363,11 +362,11 @@ void ConstantHoistingPass::collectConstantCandidates( // Ask the target about the cost of materializing the constant for the given // instruction and operand index. if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst)) - Cost = TTI->getIntImmCost(IntrInst->getIntrinsicID(), Idx, - ConstInt->getValue(), ConstInt->getType()); + Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx, + ConstInt->getValue(), ConstInt->getType()); else - Cost = TTI->getIntImmCost(Inst->getOpcode(), Idx, ConstInt->getValue(), - ConstInt->getType()); + Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(), + ConstInt->getType()); // Ignore cheap integer constants. if (Cost > TargetTransformInfo::TCC_Basic) { @@ -403,7 +402,7 @@ void ConstantHoistingPass::collectConstantCandidates( return; // Get offset from the base GV. - PointerType *GVPtrTy = dyn_cast<PointerType>(BaseGV->getType()); + PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType()); IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace()); APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true); auto *GEPO = cast<GEPOperator>(ConstExpr); @@ -417,7 +416,7 @@ void ConstantHoistingPass::collectConstantCandidates( // usually lowered to a load from constant pool. Such operation is unlikely // to be cheaper than compute it by <Base + Offset>, which can be lowered to // an ADD instruction or folded into Load/Store instruction. - int Cost = TTI->getIntImmCost(Instruction::Add, 1, Offset, PtrIntTy); + int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy); ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV]; ConstCandMapType::iterator Itr; bool Inserted; @@ -488,9 +487,10 @@ void ConstantHoistingPass::collectConstantCandidates( // Scan all operands. 
for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { // The cost of materializing the constants (defined in - // `TargetTransformInfo::getIntImmCost`) for instructions which only take - // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So - // it's safe for us to collect constant candidates from all IntrinsicInsts. + // `TargetTransformInfo::getIntImmCostInst`) for instructions which only + // take constant variables is lower than `TargetTransformInfo::TCC_Basic`. + // So it's safe for us to collect constant candidates from all + // IntrinsicInsts. if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) { collectConstantCandidates(ConstCandMap, Inst, Idx); } @@ -501,9 +501,13 @@ void ConstantHoistingPass::collectConstantCandidates( /// into an instruction itself. void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { ConstCandMapType ConstCandMap; - for (BasicBlock &BB : Fn) + for (BasicBlock &BB : Fn) { + // Ignore unreachable basic blocks. + if (!DT->isReachableFromEntry(&BB)) + continue; for (Instruction &Inst : BB) collectConstantCandidates(ConstCandMap, &Inst); + } } // This helper function is necessary to deal with values that have different @@ -554,7 +558,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); @@ -577,7 +582,7 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, for (auto User : ConstCand->Uses) { unsigned Opcode = User.Inst->getOpcode(); unsigned OpndIdx = User.OpndIdx; - Cost += TTI->getIntImmCost(Opcode, OpndIdx, Value, Ty); + Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty); LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n"); for (auto C2 = S; C2 != E; ++C2) { @@ -830,7 +835,7 @@ bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) { SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec = BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec; for (auto const &ConstInfo : ConstInfoVec) { - SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo); + SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo); // We can have an empty set if the function contains unreachable blocks. 
if (IPSet.empty()) continue; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 770321c740a0..73bf1d521b1d 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" @@ -82,7 +83,7 @@ bool ConstantPropagation::runOnFunction(Function &F) { bool Changed = false; const DataLayout &DL = F.getParent()->getDataLayout(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); while (!WorkList.empty()) { SmallVector<Instruction*, 16> NewWorkListVec; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 89497177524f..3435bc7f5eaa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -37,6 +37,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -62,6 +63,23 @@ STATISTIC(NumSDivs, "Number of sdiv converted to udiv"); STATISTIC(NumUDivs, "Number of udivs whose width was decreased"); STATISTIC(NumAShrs, "Number of ashr converted to lshr"); STATISTIC(NumSRems, "Number of srem converted to urem"); +STATISTIC(NumSExt, "Number of sext converted to zext"); +STATISTIC(NumAnd, "Number of ands removed"); +STATISTIC(NumNW, "Number of no-wrap deductions"); +STATISTIC(NumNSW, "Number of no-signed-wrap deductions"); +STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions"); +STATISTIC(NumAddNW, "Number of no-wrap deductions for add"); +STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add"); +STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add"); +STATISTIC(NumSubNW, "Number of no-wrap deductions for sub"); +STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub"); +STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub"); +STATISTIC(NumMulNW, "Number of no-wrap deductions for mul"); +STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul"); +STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul"); +STATISTIC(NumShlNW, "Number of no-wrap deductions for shl"); +STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl"); +STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl"); STATISTIC(NumOverflows, "Number of overflow checks removed"); STATISTIC(NumSaturating, "Number of saturating arithmetics converted to normal arithmetics"); @@ -85,6 +103,7 @@ namespace { AU.addRequired<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<LazyValueInfoWrapperPass>(); } }; @@ -176,7 +195,14 @@ static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI, } // All constant incoming values map to the same variable along the incoming - // edges of the phi. The phi is unnecessary. + // edges of the phi. 
The phi is unnecessary. However, we must drop all + // poison-generating flags to ensure that no poison is propagated to the phi + // location by performing this substitution. + // Warning: If the underlying analysis changes, this may not be enough to + // guarantee that poison is not propagated. + // TODO: We may be able to re-infer flags by re-analyzing the instruction. + if (auto *CommonInst = dyn_cast<Instruction>(CommonValue)) + CommonInst->dropPoisonGeneratingFlags(); P->replaceAllUsesWith(CommonValue); P->eraseFromParent(); ++NumPhiCommon; @@ -416,37 +442,96 @@ static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) { return NWRegion.contains(LRange); } -static void processOverflowIntrinsic(WithOverflowInst *WO) { - IRBuilder<> B(WO); - Value *NewOp = B.CreateBinOp( - WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), WO->getName()); - // Constant-folding could have happened. - if (auto *Inst = dyn_cast<Instruction>(NewOp)) { - if (WO->isSigned()) +static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode, + bool NewNSW, bool NewNUW) { + Statistic *OpcNW, *OpcNSW, *OpcNUW; + switch (Opcode) { + case Instruction::Add: + OpcNW = &NumAddNW; + OpcNSW = &NumAddNSW; + OpcNUW = &NumAddNUW; + break; + case Instruction::Sub: + OpcNW = &NumSubNW; + OpcNSW = &NumSubNSW; + OpcNUW = &NumSubNUW; + break; + case Instruction::Mul: + OpcNW = &NumMulNW; + OpcNSW = &NumMulNSW; + OpcNUW = &NumMulNUW; + break; + case Instruction::Shl: + OpcNW = &NumShlNW; + OpcNSW = &NumShlNSW; + OpcNUW = &NumShlNUW; + break; + default: + llvm_unreachable("Will not be called with other binops"); + } + + auto *Inst = dyn_cast<Instruction>(V); + if (NewNSW) { + ++NumNW; + ++*OpcNW; + ++NumNSW; + ++*OpcNSW; + if (Inst) Inst->setHasNoSignedWrap(); - else + } + if (NewNUW) { + ++NumNW; + ++*OpcNW; + ++NumNUW; + ++*OpcNUW; + if (Inst) Inst->setHasNoUnsignedWrap(); } +} + +static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI); - Value *NewI = B.CreateInsertValue(UndefValue::get(WO->getType()), NewOp, 0); - NewI = B.CreateInsertValue(NewI, ConstantInt::getFalse(WO->getContext()), 1); +// Rewrite this with.overflow intrinsic as non-overflowing. +static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) { + IRBuilder<> B(WO); + Instruction::BinaryOps Opcode = WO->getBinaryOp(); + bool NSW = WO->isSigned(); + bool NUW = !WO->isSigned(); + + Value *NewOp = + B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName()); + setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW); + + StructType *ST = cast<StructType>(WO->getType()); + Constant *Struct = ConstantStruct::get(ST, + { UndefValue::get(ST->getElementType(0)), + ConstantInt::getFalse(ST->getElementType(1)) }); + Value *NewI = B.CreateInsertValue(Struct, NewOp, 0); WO->replaceAllUsesWith(NewI); WO->eraseFromParent(); ++NumOverflows; + + // See if we can infer the other no-wrap too. 
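// Concretely, for a signed case this rewrite turns (a minimal IR sketch,
// assuming LVI has already proven the add cannot overflow):
//   %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %val = extractvalue { i32, i1 } %res, 0
//   %ovf = extractvalue { i32, i1 } %res, 1
// into:
//   %val1 = add nsw i32 %a, %b
//   %res1 = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %val1, 0
// so %ovf folds to false; the processBinOp call just below then asks LVI
// whether the complementary flag (nuw here) can be added as well.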
+ if (auto *BO = dyn_cast<BinaryOperator>(NewOp)) + processBinOp(BO, LVI); } -static void processSaturatingInst(SaturatingInst *SI) { +static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) { + Instruction::BinaryOps Opcode = SI->getBinaryOp(); + bool NSW = SI->isSigned(); + bool NUW = !SI->isSigned(); BinaryOperator *BinOp = BinaryOperator::Create( - SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI); + Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI); BinOp->setDebugLoc(SI->getDebugLoc()); - if (SI->isSigned()) - BinOp->setHasNoSignedWrap(); - else - BinOp->setHasNoUnsignedWrap(); + setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW); SI->replaceAllUsesWith(BinOp); SI->eraseFromParent(); ++NumSaturating; + + // See if we can infer the other no-wrap too. + if (auto *BO = dyn_cast<BinaryOperator>(BinOp)) + processBinOp(BO, LVI); } /// Infer nonnull attributes for the arguments at the specified callsite. @@ -456,14 +541,14 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { if (auto *WO = dyn_cast<WithOverflowInst>(CS.getInstruction())) { if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) { - processOverflowIntrinsic(WO); + processOverflowIntrinsic(WO, LVI); return true; } } if (auto *SI = dyn_cast<SaturatingInst>(CS.getInstruction())) { if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) { - processSaturatingInst(SI); + processSaturatingInst(SI, LVI); return true; } } @@ -632,6 +717,27 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) { return true; } +static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) { + if (SDI->getType()->isVectorTy()) + return false; + + Value *Base = SDI->getOperand(0); + + Constant *Zero = ConstantInt::get(Base->getType(), 0); + if (LVI->getPredicateAt(ICmpInst::ICMP_SGE, Base, Zero, SDI) != + LazyValueInfo::True) + return false; + + ++NumSExt; + auto *ZExt = + CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI); + ZExt->setDebugLoc(SDI->getDebugLoc()); + SDI->replaceAllUsesWith(ZExt); + SDI->eraseFromParent(); + + return true; +} + static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { using OBO = OverflowingBinaryOperator; @@ -648,6 +754,7 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { BasicBlock *BB = BinOp->getParent(); + Instruction::BinaryOps Opcode = BinOp->getOpcode(); Value *LHS = BinOp->getOperand(0); Value *RHS = BinOp->getOperand(1); @@ -655,24 +762,48 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) { ConstantRange RRange = LVI->getConstantRange(RHS, BB, BinOp); bool Changed = false; + bool NewNUW = false, NewNSW = false; if (!NUW) { ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoUnsignedWrap); - bool NewNUW = NUWRange.contains(LRange); - BinOp->setHasNoUnsignedWrap(NewNUW); + Opcode, RRange, OBO::NoUnsignedWrap); + NewNUW = NUWRange.contains(LRange); Changed |= NewNUW; } if (!NSW) { ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( - BinOp->getOpcode(), RRange, OBO::NoSignedWrap); - bool NewNSW = NSWRange.contains(LRange); - BinOp->setHasNoSignedWrap(NewNSW); + Opcode, RRange, OBO::NoSignedWrap); + NewNSW = NSWRange.contains(LRange); Changed |= NewNSW; } + setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW); + return Changed; } +static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) { + if (BinOp->getType()->isVectorTy()) + return false; + + // Pattern match (and lhs, 
C) where C includes a superset of bits which might + // be set in lhs. This is a common truncation idiom created by instcombine. + BasicBlock *BB = BinOp->getParent(); + Value *LHS = BinOp->getOperand(0); + ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1)); + if (!RHS || !RHS->getValue().isMask()) + return false; + + ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp); + if (!LRange.getUnsignedMax().ule(RHS->getValue())) + return false; + + BinOp->replaceAllUsesWith(LHS); + BinOp->eraseFromParent(); + NumAnd++; + return true; +} + + static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { if (Constant *C = LVI->getConstant(V, At->getParent(), At)) return C; @@ -740,10 +871,18 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT, case Instruction::AShr: BBChanged |= processAShr(cast<BinaryOperator>(II), LVI); break; + case Instruction::SExt: + BBChanged |= processSExt(cast<SExtInst>(II), LVI); + break; case Instruction::Add: case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI); break; + case Instruction::And: + BBChanged |= processAnd(cast<BinaryOperator>(II), LVI); + break; } } @@ -796,5 +935,6 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve<GlobalsAA>(); PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LazyValueAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp index 479e0ed74074..a4b0c8df98f6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DCE.cpp @@ -19,12 +19,14 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; #define DEBUG_TYPE "dce" @@ -38,17 +40,19 @@ namespace { //===--------------------------------------------------------------------===// // DeadInstElimination pass implementation // - struct DeadInstElimination : public BasicBlockPass { - static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(ID) { - initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); - } - bool runOnBasicBlock(BasicBlock &BB) override { - if (skipBasicBlock(BB)) - return false; - auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; - bool Changed = false; +struct DeadInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + DeadInstElimination() : FunctionPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); + TargetLibraryInfo *TLI = TLIP ? 
&TLIP->getTLI(F) : nullptr; + + bool Changed = false; + for (auto &BB : F) { for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { Instruction *Inst = &*DI++; if (isInstructionTriviallyDead(Inst, TLI)) { @@ -60,13 +64,14 @@ namespace { ++DIEEliminated; } } - return Changed; } + return Changed; + } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - }; +}; } char DeadInstElimination::ID = 0; @@ -77,6 +82,43 @@ Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); } +//===--------------------------------------------------------------------===// +// RedundantDbgInstElimination pass implementation +// + +namespace { +struct RedundantDbgInstElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantDbgInstElimination() : FunctionPass(ID) { + initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + bool Changed = false; + for (auto &BB : F) + Changed |= RemoveRedundantDbgInstrs(&BB); + return Changed; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } +}; +} + +char RedundantDbgInstElimination::ID = 0; +INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim", + "Redundant Dbg Instruction Elimination", false, false) + +Pass *llvm::createRedundantDbgInstEliminationPass() { + return new RedundantDbgInstElimination(); +} + +//===--------------------------------------------------------------------===// +// DeadCodeElimination pass implementation +// + static bool DCEInstruction(Instruction *I, SmallSetVector<Instruction *, 16> &WorkList, const TargetLibraryInfo *TLI) { @@ -154,7 +196,7 @@ struct DCELegacyPass : public FunctionPass { return false; auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI() : nullptr; + TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; return eliminateDeadCode(F, TLI); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index a81645745b48..1ba4aab999e1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -17,6 +17,7 @@ #include "llvm/Transforms/Scalar/DeadStoreElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -48,6 +49,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -99,6 +101,7 @@ static void deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, MemoryDependenceResults &MD, const TargetLibraryInfo &TLI, InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst, SmallSetVector<const Value *, 16> *ValueSet = nullptr) { SmallVector<Instruction*, 32> NowDeadInsts; @@ -112,6 +115,10 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, // Before we touch this instruction, remove it from memdep! 
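// (Outline of the ThrowableInst bookkeeping threaded through this file, which
// replaces the old single LastThrowing pointer; a condensed sketch using the
// same MapVector<Instruction *, bool>:
//   ThrowableInst[I] = true;            // record each may-throw instruction
//   ThrowableInst[DeadInst] = false;    // flip to false when it is deleted
//   while (!ThrowableInst.empty() && !ThrowableInst.back().second)
//     ThrowableInst.pop_back();         // trim entries already deleted
//   Instruction *LastThrowing =
//       ThrowableInst.empty() ? nullptr : ThrowableInst.back().first;
// MapVector keeps insertion order, so back() is the most recent still-live
// throwing instruction.)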
do { Instruction *DeadInst = NowDeadInsts.pop_back_val(); + // Mark the DeadInst as dead in the list of throwable instructions. + auto It = ThrowableInst.find(DeadInst); + if (It != ThrowableInst.end()) + ThrowableInst[It->first] = false; ++NumFastOther; // Try to preserve debug information attached to the dead instruction. @@ -144,6 +151,9 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI, DeadInst->eraseFromParent(); } while (!NowDeadInsts.empty()); *BBI = NewIter; + // Pop dead entries from back of ThrowableInst till we find an alive entry. + while (!ThrowableInst.empty() && !ThrowableInst.back().second) + ThrowableInst.pop_back(); } /// Does this instruction write some memory? This only returns true for things @@ -169,15 +179,18 @@ static bool hasAnalyzableMemoryWrite(Instruction *I, } if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { - StringRef FnName = F->getName(); - if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) - return true; - if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) - return true; - if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) - return true; - if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) - return true; + LibFunc LF; + if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) { + switch (LF) { + case LibFunc_strcpy: + case LibFunc_strncpy: + case LibFunc_strcat: + case LibFunc_strncat: + return true; + default: + return false; + } + } } } return false; @@ -656,7 +669,8 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks, static bool handleFree(CallInst *F, AliasAnalysis *AA, MemoryDependenceResults *MD, DominatorTree *DT, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; MemoryLocation Loc = MemoryLocation(F->getOperand(0)); @@ -690,7 +704,8 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA, // DCE instructions only used to calculate that store. BasicBlock::iterator BBI(Dependency); - deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -747,8 +762,8 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc, static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, MemoryDependenceResults *MD, const TargetLibraryInfo *TLI, - InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { bool MadeChange = false; // Keep track of all of the stack objects that are dead at the end of the @@ -809,7 +824,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, << '\n'); // DCE instructions only used to calculate that store. 
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastStores; MadeChange = true; @@ -821,7 +836,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA, if (isInstructionTriviallyDead(&*BBI, TLI)) { LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: " << *&*BBI << '\n'); - deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, + deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst, &DeadStackObjects); ++NumFastOther; MadeChange = true; @@ -1028,7 +1043,8 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, const DataLayout &DL, const TargetLibraryInfo *TLI, InstOverlapIntervalsTy &IOL, - OrderedBasicBlock &OBB) { + OrderedBasicBlock &OBB, + MapVector<Instruction *, bool> &ThrowableInst) { // Must be a store instruction. StoreInst *SI = dyn_cast<StoreInst>(Inst); if (!SI) @@ -1044,7 +1060,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1062,7 +1078,7 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI, dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: " << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n'); - deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst); ++NumRedundantStores; return true; } @@ -1077,7 +1093,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, bool MadeChange = false; OrderedBasicBlock OBB(&BB); - Instruction *LastThrowing = nullptr; + MapVector<Instruction *, bool> ThrowableInst; // A map of interval maps representing partially-overwritten value parts. InstOverlapIntervalsTy IOL; @@ -1086,7 +1102,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { // Handle 'free' calls specially. if (CallInst *F = isFreeCall(&*BBI, TLI)) { - MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB); + MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB, ThrowableInst); // Increment BBI after handleFree has potentially deleted instructions. // This ensures we maintain a valid iterator. ++BBI; @@ -1096,7 +1112,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, Instruction *Inst = &*BBI++; if (Inst->mayThrow()) { - LastThrowing = Inst; + ThrowableInst[Inst] = true; continue; } @@ -1105,7 +1121,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, continue; // eliminateNoopStore will update in iterator, if necessary. - if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB)) { + if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB, + ThrowableInst)) { MadeChange = true; continue; } @@ -1148,6 +1165,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, if (!DepLoc.Ptr) break; + // Find the last throwable instruction not removed by call to + // deleteDeadInstruction. + Instruction *LastThrowing = nullptr; + if (!ThrowableInst.empty()) + LastThrowing = ThrowableInst.back().first; + // Make sure we don't look past a call which might throw. 
This is an // issue because MemoryDependenceAnalysis works in the wrong direction: // it finds instructions which dominate the current instruction, rather than @@ -1187,7 +1210,8 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, << "\n KILLER: " << *Inst << '\n'); // Delete the store and now-dead instructions that feed it. - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); ++NumFastStores; MadeChange = true; @@ -1254,8 +1278,9 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, auto *SI = new StoreInst( ConstantInt::get(Earlier->getValueOperand()->getType(), Merged), - Earlier->getPointerOperand(), false, Earlier->getAlignment(), - Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite); + Earlier->getPointerOperand(), false, + MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(), + Earlier->getSyncScopeID(), DepWrite); unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, @@ -1268,8 +1293,10 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, OBB.replaceInstruction(DepWrite, SI); // Delete the old stores and now-dead instructions that feed them. - deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB); - deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB); + deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); + deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB, + ThrowableInst); MadeChange = true; // We erased DepWrite and Inst (Loc); start over. @@ -1304,7 +1331,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // If this block ends in a return, unwind, or unreachable, all allocas are // dead at its end, which means stores to them are also dead. if (BB.getTerminator()->getNumSuccessors() == 0) - MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB); + MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB, ThrowableInst); return MadeChange; } @@ -1361,7 +1388,7 @@ public: MemoryDependenceResults *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); return eliminateDeadStores(F, AA, MD, DT, TLI); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index e64651d97495..132dfc8f6da1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -1,4 +1,4 @@ -//===- DivRemPairs.cpp - Hoist/decompose division and remainder -*- C++ -*-===// +//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This pass hoists and/or decomposes integer division and remainder +// This pass hoists and/or decomposes/recomposes integer division and remainder // instructions to enable CFG improvements and better codegen. 
// //===----------------------------------------------------------------------===// @@ -19,20 +19,58 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BypassSlowDivision.h" using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "div-rem-pairs" STATISTIC(NumPairs, "Number of div/rem pairs"); +STATISTIC(NumRecomposed, "Number of instructions recomposed"); STATISTIC(NumHoisted, "Number of instructions hoisted"); STATISTIC(NumDecomposed, "Number of instructions decomposed"); DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform", "Controls transformations in div-rem-pairs pass"); +namespace { +struct ExpandedMatch { + DivRemMapKey Key; + Instruction *Value; +}; +} // namespace + +/// See if we can match: (which is the form we expand into) +/// X - ((X ?/ Y) * Y) +/// which is equivalent to: +/// X ?% Y +static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) { + Value *Dividend, *XroundedDownToMultipleOfY; + if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY)))) + return llvm::None; + + Value *Divisor; + Instruction *Div; + // Look for ((X / Y) * Y) + if (!match( + XroundedDownToMultipleOfY, + m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)), + m_Instruction(Div)), + m_Deferred(Divisor)))) + return llvm::None; + + ExpandedMatch M; + M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv; + M.Key.Dividend = Dividend; + M.Key.Divisor = Divisor; + M.Value = &I; + return M; +} + /// A thin wrapper to store two values that we matched as div-rem pair. /// We want this extra indirection to avoid dealing with RAUW'ing the map keys. struct DivRemPairWorklistEntry { @@ -62,6 +100,16 @@ struct DivRemPairWorklistEntry { /// In this pair, what are the divident and divisor? Value *getDividend() const { return DivInst->getOperand(0); } Value *getDivisor() const { return DivInst->getOperand(1); } + + bool isRemExpanded() const { + switch (RemInst->getOpcode()) { + case Instruction::SRem: + case Instruction::URem: + return false; // single 'rem' instruction - unexpanded form. + default: + return true; // anything else means we have remainder in expanded form. + } + } }; using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>; @@ -87,6 +135,8 @@ static DivRemWorklistTy getWorklist(Function &F) { RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I; else if (I.getOpcode() == Instruction::URem) RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I; + else if (auto Match = matchExpandedRem(I)) + RemMap[Match->Key] = Match->Value; } } @@ -137,11 +187,43 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, // Process each entry in the worklist. for (DivRemPairWorklistEntry &E : Worklist) { + if (!DebugCounter::shouldExecute(DRPCounter)) + continue; + bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned()); auto &DivInst = E.DivInst; auto &RemInst = E.RemInst; + const bool RemOriginallyWasInExpandedForm = E.isRemExpanded(); + (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning + + if (HasDivRemOp && E.isRemExpanded()) { + // The target supports div+rem but the rem is expanded. + // We should recompose it first. 
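// In IR terms, the recomposition below rewrites the expanded form back into a
// single remainder (a minimal sketch, assuming hasDivRemOp() is true for this
// type):
//   %div = sdiv i32 %x, %y
//   %mul = mul i32 %div, %y
//   %rem = sub i32 %x, %mul
// becomes
//   %div = sdiv i32 %x, %y
//   %rem.recomposed = srem i32 %x, %y
// with the original sub erased and the now-unused ((X / Y) * Y) multiply left
// behind for later dead-code cleanup, as noted further down.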
+ Value *X = E.getDividend(); + Value *Y = E.getDivisor(); + Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y) + : BinaryOperator::CreateURem(X, Y); + // Note that we place it right next to the original expanded instruction, + // and letting further handling to move it if needed. + RealRem->setName(RemInst->getName() + ".recomposed"); + RealRem->insertAfter(RemInst); + Instruction *OrigRemInst = RemInst; + // Update AssertingVH<> with new instruction so it doesn't assert. + RemInst = RealRem; + // And replace the original instruction with the new one. + OrigRemInst->replaceAllUsesWith(RealRem); + OrigRemInst->eraseFromParent(); + NumRecomposed++; + // Note that we have left ((X / Y) * Y) around. + // If it had other uses we could rewrite it as X - X % Y + } + + assert((!E.isRemExpanded() || !HasDivRemOp) && + "*If* the target supports div-rem, then by now the RemInst *is* " + "Instruction::[US]Rem."); + // If the target supports div+rem and the instructions are in the same block // already, there's nothing to do. The backend should handle this. If the // target does not support div+rem, then we will decompose the rem. @@ -149,10 +231,16 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, continue; bool DivDominates = DT.dominates(DivInst, RemInst); - if (!DivDominates && !DT.dominates(RemInst, DivInst)) + if (!DivDominates && !DT.dominates(RemInst, DivInst)) { + // We have matching div-rem pair, but they are in two different blocks, + // neither of which dominates one another. + // FIXME: We could hoist both ops to the common predecessor block? continue; + } - if (!DebugCounter::shouldExecute(DRPCounter)) + // The target does not have a single div/rem operation, + // and the rem is already in expanded form. Nothing to do. + if (!HasDivRemOp && E.isRemExpanded()) continue; if (HasDivRemOp) { @@ -164,9 +252,15 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, DivInst->moveAfter(RemInst); NumHoisted++; } else { - // The target does not have a single div/rem operation. Decompose the - // remainder calculation as: + // The target does not have a single div/rem operation, + // and the rem is *not* in a already-expanded form. + // Decompose the remainder calculation as: // X % Y --> X - ((X / Y) * Y). 
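// The identity relied on here holds for signed and unsigned division alike
// whenever Y != 0 (both forms are equally undefined for Y == 0); in plain C:
//   unsigned rem_direct(unsigned X, unsigned Y)   { return X % Y; }
//   unsigned rem_expanded(unsigned X, unsigned Y) { return X - ((X / Y) * Y); }
// return the same value for every Y != 0, which is what makes this expansion
// (and the reverse recomposition above) semantics-preserving.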
+ + assert(!RemOriginallyWasInExpandedForm && + "We should not be expanding if the rem was in expanded form to " + "begin with."); + Value *X = E.getDividend(); Value *Y = E.getDivisor(); Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index f1f075257020..40c1ba88354f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/AtomicOrdering.h" @@ -55,6 +55,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <deque> #include <memory> @@ -108,11 +109,12 @@ struct SimpleValue { // This can only handle non-void readnone functions. if (CallInst *CI = dyn_cast<CallInst>(Inst)) return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); - return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || - isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || - isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || - isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || - isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) || + isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || + isa<CmpInst>(Inst) || isa<SelectInst>(Inst) || + isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || + isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) || + isa<InsertValueInst>(Inst); } }; @@ -240,7 +242,7 @@ static unsigned getHashValueImpl(SimpleValue Val) { assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) || isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) || - isa<ShuffleVectorInst>(Inst)) && + isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst)) && "Invalid/unknown instruction"); // Mix in the opcode. @@ -526,7 +528,7 @@ public: const TargetTransformInfo &TTI, DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(); @@ -651,7 +653,7 @@ private: bool isInvariantLoad() const { if (auto *LI = dyn_cast<LoadInst>(Inst)) - return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr; + return LI->hasMetadata(LLVMContext::MD_invariant_load); return false; } @@ -790,7 +792,7 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) { // A location loaded from with an invariant_load is assumed to *never* change // within the visible scope of the compilation. 
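// For reference, the metadata queried here is attached to loads like so (a
// minimal IR example; the node has no operands, only its presence matters):
//   %v = load i32, i32* %p, !invariant.load !0
//   !0 = !{}
// hasMetadata() is simply the boolean form of the getMetadata() != nullptr
// test it replaces.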
if (auto *LI = dyn_cast<LoadInst>(I)) - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; auto MemLocOpt = MemoryLocation::getOrNone(I); @@ -905,8 +907,8 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n"); continue; } - if (!salvageDebugInfo(*Inst)) - replaceDbgUsesWithUndef(Inst); + + salvageDebugInfoOrMarkUndef(*Inst); removeMSSA(Inst); Inst->eraseFromParent(); Changed = true; @@ -1359,7 +1361,7 @@ public: if (skipFunction(F)) return false; - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1381,6 +1383,7 @@ public: AU.addPreserved<MemorySSAWrapperPass>(); } AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); AU.setPreservesCFG(); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 31670b1464e4..72512430b366 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,10 +11,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + using namespace llvm; #define DEBUG_TYPE "flattencfg" @@ -52,15 +55,23 @@ FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); } static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; + + // Use block handles instead of iterating over function blocks directly + // to avoid using iterators invalidated by erasing blocks. + std::vector<WeakVH> Blocks; + Blocks.reserve(F.size()); + for (auto &BB : F) + Blocks.push_back(&BB); + while (LocalChange) { LocalChange = false; - // Loop over all of the basic blocks and remove them if they are unneeded... - // - for (Function::iterator BBIt = F.begin(); BBIt != F.end();) { - if (FlattenCFG(&*BBIt++, AA)) { - LocalChange = true; - } + // Loop over all of the basic blocks and try to flatten them. + for (WeakVH &BlockHandle : Blocks) { + // Skip blocks erased by FlattenCFG. 
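// (WeakVH is a value handle that nulls itself when the value it tracks is
// deleted, so the snapshot of handles built above remains safe to walk even
// as FlattenCFG erases blocks: an erased block simply reads back as null and
// is skipped here.)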
+ if (auto *BB = cast_or_null<BasicBlock>(BlockHandle)) + if (FlattenCFG(BB, AA)) + LocalChange = true; } Changed |= LocalChange; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp index 4f83e869b303..af223cc837f2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #define DEBUG_TYPE "float2int" #include "llvm/Transforms/Scalar/Float2Int.h" @@ -60,11 +62,13 @@ namespace { if (skipFunction(F)) return false; - return Impl.runImpl(F); + const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + return Impl.runImpl(F, DT); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -116,21 +120,29 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) { // Find the roots - instructions that convert from the FP domain to // integer domain. -void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { - for (auto &I : instructions(F)) { - if (isa<VectorType>(I.getType())) +void Float2IntPass::findRoots(Function &F, const DominatorTree &DT, + SmallPtrSet<Instruction*,8> &Roots) { + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!DT.isReachableFromEntry(&BB)) continue; - switch (I.getOpcode()) { - default: break; - case Instruction::FPToUI: - case Instruction::FPToSI: - Roots.insert(&I); - break; - case Instruction::FCmp: - if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != - CmpInst::BAD_ICMP_PREDICATE) + + for (Instruction &I : BB) { + if (isa<VectorType>(I.getType())) + continue; + switch (I.getOpcode()) { + default: break; + case Instruction::FPToUI: + case Instruction::FPToSI: Roots.insert(&I); - break; + break; + case Instruction::FCmp: + if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) != + CmpInst::BAD_ICMP_PREDICATE) + Roots.insert(&I); + break; + } } } } @@ -503,7 +515,7 @@ void Float2IntPass::cleanup() { I.first->eraseFromParent(); } -bool Float2IntPass::runImpl(Function &F) { +bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n"); // Clear out all state. 
ECs = EquivalenceClasses<Instruction*>(); @@ -513,7 +525,7 @@ bool Float2IntPass::runImpl(Function &F) { Ctx = &F.getParent()->getContext(); - findRoots(F, Roots); + findRoots(F, DT, Roots); walkBackwards(Roots); walkForwards(); @@ -527,8 +539,9 @@ bool Float2IntPass::runImpl(Function &F) { namespace llvm { FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } -PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { - if (!runImpl(F)) +PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) { + const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + if (!runImpl(F, DT)) return PreservedAnalyses::all(); PreservedAnalyses PA; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp index 542d3b3f7814..1e6aab14e7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVN.cpp @@ -64,12 +64,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" @@ -111,7 +113,7 @@ static cl::opt<uint32_t> MaxNumDeps( struct llvm::GVN::Expression { uint32_t opcode; - Type *type; + Type *type = nullptr; bool commutative = false; SmallVector<uint32_t, 4> varargs; @@ -172,7 +174,7 @@ struct llvm::gvn::AvailableValue { PointerIntPair<Value *, 2, ValType> Val; /// Offset - The byte offset in Val that is interesting for the load query. - unsigned Offset; + unsigned Offset = 0; static AvailableValue get(Value *V, unsigned Offset = 0) { AvailableValue Res; @@ -236,7 +238,7 @@ struct llvm::gvn::AvailableValue { /// the associated BasicBlock. struct llvm::gvn::AvailableValueInBlock { /// BB - The basic block in question. - BasicBlock *BB; + BasicBlock *BB = nullptr; /// AV - The actual available value AvailableValue AV; @@ -363,6 +365,7 @@ GVN::ValueTable::ValueTable() = default; GVN::ValueTable::ValueTable(const ValueTable &) = default; GVN::ValueTable::ValueTable(ValueTable &&) = default; GVN::ValueTable::~ValueTable() = default; +GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { @@ -626,6 +629,8 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { PA.preserve<DominatorTreeAnalysis>(); PA.preserve<GlobalsAA>(); PA.preserve<TargetLibraryAnalysis>(); + if (LI) + PA.preserve<LoopAnalysis>(); return PA; } @@ -1161,15 +1166,30 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, // Do PHI translation to get its value in the predecessor if necessary. The // returned pointer (if non-null) is guaranteed to dominate UnavailablePred. + // We do the translation for each edge we skipped by going from LI's block + // to LoadBB, otherwise we might miss pieces needing translation. // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. 
- PHITransAddr Address(LI->getPointerOperand(), DL, AC); - Value *LoadPtr = nullptr; - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); + Value *LoadPtr = LI->getPointerOperand(); + BasicBlock *Cur = LI->getParent(); + while (Cur != LoadBB) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion( + Cur, Cur->getSinglePredecessor(), *DT, NewInsts); + if (!LoadPtr) { + CanDoPRE = false; + break; + } + Cur = Cur->getSinglePredecessor(); + } + if (LoadPtr) { + PHITransAddr Address(LoadPtr, DL, AC); + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT, + NewInsts); + } // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. if (!LoadPtr) { @@ -1184,8 +1204,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (!CanDoPRE) { while (!NewInsts.empty()) { - Instruction *I = NewInsts.pop_back_val(); - markInstructionForDeletion(I); + // Erase instructions generated by the failed PHI translation before + // trying to number them. PHI translation might insert instructions + // in basic blocks other than the current one, and we delete them + // directly, as markInstructionForDeletion only allows removing from the + // current basic block. + NewInsts.pop_back_val()->eraseFromParent(); } // HINT: Don't revert the edge-splitting as following transformation may // also need to split these critical edges. @@ -1219,10 +1243,10 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, BasicBlock *UnavailablePred = PredLoad.first; Value *LoadPtr = PredLoad.second; - auto *NewLoad = - new LoadInst(LI->getType(), LoadPtr, LI->getName() + ".pre", - LI->isVolatile(), LI->getAlignment(), LI->getOrdering(), - LI->getSyncScopeID(), UnavailablePred->getTerminator()); + auto *NewLoad = new LoadInst( + LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(), + MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(), + UnavailablePred->getTerminator()); NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. @@ -1365,6 +1389,67 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); } +static bool impliesEquivalanceIfTrue(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ) + return true; + + // Floating point comparisons can be equal, but not equivalent. Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ || + (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ && + Cmp->getFastMathFlags().noNaNs())) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + +static bool impliesEquivalanceIfFalse(CmpInst* Cmp) { + if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE) + return true; + + // Floating point comparisons can be equal, but not equivelent. 
Cases: + // NaNs for unordered operators + // +0.0 vs 0.0 for all operators + if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE && + Cmp->getFastMathFlags().noNaNs()) || + Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) { + Value *LHS = Cmp->getOperand(0); + Value *RHS = Cmp->getOperand(1); + // If we can prove either side non-zero, then equality must imply + // equivalence. + // FIXME: We should do this optimization if 'no signed zeros' is + // applicable via an instruction-level fast-math-flag or some other + // indicator that relaxed FP semantics are being used. + if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero()) + return true; + if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero()) + return true;; + // TODO: Handle vector floating point constants + } + return false; +} + + +static bool hasUsersIn(Value *V, BasicBlock *BB) { + for (User *U : V->users()) + if (isa<Instruction>(U) && + cast<Instruction>(U)->getParent() == BB) + return true; + return false; +} + bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume && "This function can only be called with llvm.assume intrinsic"); @@ -1403,26 +1488,65 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) { // We can replace assume value with true, which covers cases like this: // call void @llvm.assume(i1 %cmp) // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true - ReplaceWithConstMap[V] = True; - - // If one of *cmp *eq operand is const, adding it to map will cover this: + ReplaceOperandsWithMap[V] = True; + + // If we find an equality fact, canonicalize all dominated uses in this block + // to one of the two values. We heuristically choice the "oldest" of the + // two where age is determined by value number. (Note that propagateEquality + // above handles the cross block case.) + // + // Key case to cover are: + // 1) // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen // call void @llvm.assume(i1 %cmp) // ret float %0 ; will change it to ret float 3.000000e+00 + // 2) + // %load = load float, float* %addr + // %cmp = fcmp oeq float %load, %0 + // call void @llvm.assume(i1 %cmp) + // ret float %load ; will change it to ret float %0 if (auto *CmpI = dyn_cast<CmpInst>(V)) { - if (CmpI->getPredicate() == CmpInst::Predicate::ICMP_EQ || - CmpI->getPredicate() == CmpInst::Predicate::FCMP_OEQ || - (CmpI->getPredicate() == CmpInst::Predicate::FCMP_UEQ && - CmpI->getFastMathFlags().noNaNs())) { + if (impliesEquivalanceIfTrue(CmpI)) { Value *CmpLHS = CmpI->getOperand(0); Value *CmpRHS = CmpI->getOperand(1); - if (isa<Constant>(CmpLHS)) + // Heuristically pick the better replacement -- the choice of heuristic + // isn't terribly important here, but the fact we canonicalize on some + // replacement is for exposing other simplifications. + // TODO: pull this out as a helper function and reuse w/existing + // (slightly different) logic. + if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS)) std::swap(CmpLHS, CmpRHS); - auto *RHSConst = dyn_cast<Constant>(CmpRHS); + if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS)) + std::swap(CmpLHS, CmpRHS); + if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) || + (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) { + // Move the 'oldest' value to the right-hand side, using the value + // number as a proxy for age. 
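// (lookupOrAdd hands out value numbers in the order values are first
// encountered, so a smaller number roughly means "defined earlier"; as noted
// above, the exact choice matters less than settling on *some* canonical
// side.)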
+ uint32_t LVN = VN.lookupOrAdd(CmpLHS); + uint32_t RVN = VN.lookupOrAdd(CmpRHS); + if (LVN < RVN) + std::swap(CmpLHS, CmpRHS); + } - // If only one operand is constant. - if (RHSConst != nullptr && !isa<Constant>(CmpLHS)) - ReplaceWithConstMap[CmpLHS] = RHSConst; + // Handle degenerate case where we either haven't pruned a dead path or a + // removed a trivial assume yet. + if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS)) + return Changed; + + LLVM_DEBUG(dbgs() << "Replacing dominated uses of " + << *CmpLHS << " with " + << *CmpRHS << " in block " + << IntrinsicI->getParent()->getName() << "\n"); + + + // Setup the replacement map - this handles uses within the same block + if (hasUsersIn(CmpLHS, IntrinsicI->getParent())) + ReplaceOperandsWithMap[CmpLHS] = CmpRHS; + + // NOTE: The non-block local cases are handled by the call to + // propagateEquality above; this block is just about handling the block + // local cases. TODO: There's a bunch of logic in propagateEqualiy which + // isn't duplicated for the block local case, can we share it somehow? } } return Changed; @@ -1675,16 +1799,12 @@ void GVN::assignBlockRPONumber(Function &F) { InvalidBlockRPONumbers = false; } -// Tries to replace instruction with const, using information from -// ReplaceWithConstMap. -bool GVN::replaceOperandsWithConsts(Instruction *Instr) const { +bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const { bool Changed = false; for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) { - Value *Operand = Instr->getOperand(OpNum); - auto it = ReplaceWithConstMap.find(Operand); - if (it != ReplaceWithConstMap.end()) { - assert(!isa<Constant>(Operand) && - "Replacing constants with constants is invalid"); + Value *Operand = Instr->getOperand(OpNum); + auto it = ReplaceOperandsWithMap.find(Operand); + if (it != ReplaceOperandsWithMap.end()) { LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with " << *it->second << " in instruction " << *Instr << '\n'); Instr->setOperand(OpNum, it->second); @@ -1801,27 +1921,12 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1); // If "A == B" is known true, or "A != B" is known false, then replace - // A with B everywhere in the scope. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) + // A with B everywhere in the scope. For floating point operations, we + // have to be careful since equality does not always imply equivalance. + if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) || + (isKnownFalse && impliesEquivalanceIfFalse(Cmp))) Worklist.push_back(std::make_pair(Op0, Op1)); - // Handle the floating point versions of equality comparisons too. - if ((isKnownTrue && Cmp->getPredicate() == CmpInst::FCMP_OEQ) || - (isKnownFalse && Cmp->getPredicate() == CmpInst::FCMP_UNE)) { - - // Floating point -0.0 and 0.0 compare equal, so we can only - // propagate values if we know that we have a constant and that - // its value is non-zero. - - // FIXME: We should do this optimization if 'no signed zeros' is - // applicable via an instruction-level fast-math-flag or some other - // indicator that relaxed FP semantics are being used. - - if (isa<ConstantFP>(Op1) && !cast<ConstantFP>(Op1)->isZero()) - Worklist.push_back(std::make_pair(Op0, Op1)); - } - // If "A >= B" is known true, replace "A < B" with false everywhere. 
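// The equality-vs-equivalence distinction enforced by impliesEquivalanceIfTrue
// / impliesEquivalanceIfFalse above comes down to two IEEE-754 wrinkles:
//   * 0.0 == -0.0 is true, yet 1.0/0.0 is +inf while 1.0/-0.0 is -inf, so
//     substituting one zero for the other can change observable results;
//   * an unordered compare such as ueq reports "equal" whenever either operand
//     is NaN, which says nothing about the operands being interchangeable.
// That is why ueq/one only count when the no-NaNs flag is present, and why a
// value is only substituted when one side is a constant known not to be zero.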
CmpInst::Predicate NotPred = Cmp->getInversePredicate(); Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse); @@ -2014,6 +2119,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, MD = RunMD; ImplicitControlFlowTracking ImplicitCFT(DT); ICF = &ImplicitCFT; + this->LI = LI; VN.setMemDep(MD); ORE = RunORE; InvalidBlockRPONumbers = true; @@ -2075,13 +2181,13 @@ bool GVN::processBlock(BasicBlock *BB) { return false; // Clearing map before every BB because it can be used only for single BB. - ReplaceWithConstMap.clear(); + ReplaceOperandsWithMap.clear(); bool ChangedFunction = false; for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - if (!ReplaceWithConstMap.empty()) - ChangedFunction |= replaceOperandsWithConsts(&*BI); + if (!ReplaceOperandsWithMap.empty()) + ChangedFunction |= replaceOperandsForInBlockEquality(&*BI); ChangedFunction |= processInstruction(&*BI); if (InstrsToErase.empty()) { @@ -2373,7 +2479,7 @@ bool GVN::performPRE(Function &F) { /// the block inserted to the critical edge. BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) { BasicBlock *BB = - SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT)); + SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI)); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2388,7 +2494,7 @@ bool GVN::splitCriticalEdges() { do { std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val(); SplitCriticalEdge(Edge.first, Edge.second, - CriticalEdgeSplittingOptions(DT)); + CriticalEdgeSplittingOptions(DT, LI)); } while (!toSplit.empty()); if (MD) MD->invalidateCachedPredecessors(); InvalidBlockRPONumbers = true; @@ -2494,18 +2600,26 @@ void GVN::addDeadBlock(BasicBlock *BB) { if (DeadBlocks.count(B)) continue; + // First, split the critical edges. This might also create additional blocks + // to preserve LoopSimplify form and adjust edges accordingly. SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B)); for (BasicBlock *P : Preds) { if (!DeadBlocks.count(P)) continue; - if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) { + if (llvm::any_of(successors(P), + [B](BasicBlock *Succ) { return Succ == B; }) && + isCriticalEdge(P->getTerminator(), B)) { if (BasicBlock *S = splitCriticalEdges(P, B)) DeadBlocks.insert(P = S); } + } - for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) { - PHINode &Phi = cast<PHINode>(*II); + // Now undef the incoming values from the dead predecessors. + for (BasicBlock *P : predecessors(B)) { + if (!DeadBlocks.count(P)) + continue; + for (PHINode &Phi : B->phis()) { Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType())); if (MD) MD->invalidateCachedPointerInfo(&Phi); @@ -2582,10 +2696,11 @@ public: return Impl.runImpl( F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), getAnalysis<AAResultsWrapperPass>().getAAResults(), - NoMemDepAnalysis ? nullptr - : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), + NoMemDepAnalysis + ? nullptr + : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(), LIWP ? 
&LIWP->getLoopInfo() : nullptr, &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE()); } @@ -2594,6 +2709,7 @@ public: AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); if (!NoMemDepAnalysis) AU.addRequired<MemoryDependenceWrapperPass>(); AU.addRequired<AAResultsWrapperPass>(); @@ -2601,6 +2717,8 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addPreserved<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addPreservedID(LoopSimplifyID); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp index 7614599653c4..e1796f6bf05a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -47,7 +47,6 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/PostDominators.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -65,6 +64,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -72,6 +72,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <iterator> @@ -257,7 +258,7 @@ public: GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA), - MSSAUpdater(llvm::make_unique<MemorySSAUpdater>(MSSA)) {} + MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {} bool run(Function &F) { NumFuncArgs = F.arg_size(); @@ -539,7 +540,7 @@ private: // Check for unsafe hoistings due to side effects. 
if (K == InsKind::Store) { - if (hasEHOrLoadsOnPath(NewPt, dyn_cast<MemoryDef>(U), NBBsOnAllPaths)) + if (hasEHOrLoadsOnPath(NewPt, cast<MemoryDef>(U), NBBsOnAllPaths)) return false; } else if (hasEHOnPath(NewBB, OldBB, NBBsOnAllPaths)) return false; @@ -889,19 +890,18 @@ private: void updateAlignment(Instruction *I, Instruction *Repl) { if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) { - ReplacementLoad->setAlignment( - std::min(ReplacementLoad->getAlignment(), - cast<LoadInst>(I)->getAlignment())); + ReplacementLoad->setAlignment(MaybeAlign(std::min( + ReplacementLoad->getAlignment(), cast<LoadInst>(I)->getAlignment()))); ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast<StoreInst>(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast<StoreInst>(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) { ReplacementAlloca->setAlignment( - std::max(ReplacementAlloca->getAlignment(), - cast<AllocaInst>(I)->getAlignment())); + MaybeAlign(std::max(ReplacementAlloca->getAlignment(), + cast<AllocaInst>(I)->getAlignment()))); } else if (isa<CallInst>(Repl)) { ++NumCallsRemoved; } @@ -957,7 +957,8 @@ private: if (MoveAccess && NewMemAcc) { // The definition of this ld/st will not change: ld/st hoisting is // legal when the ld/st is not moved past its current definition. - MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::End); + MSSAUpdater->moveToPlace(NewMemAcc, DestBB, + MemorySSA::BeforeTerminator); } // Replace all other instructions with Repl with memory access NewMemAcc. @@ -1068,6 +1069,9 @@ private: ++NI; } + if (MSSA && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + NumHoisted += NL + NS + NC + NI; NumRemoved += NR; NumLoadsHoisted += NL; @@ -1169,6 +1173,7 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); } }; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp index 054025755c69..6d0a4975e266 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -47,7 +47,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -59,6 +58,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -71,6 +71,7 @@ #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp index e14f44bb7069..a3eba27a4d90 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -39,7 +39,6 @@ 
//===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" -#include <functional> #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/Statistic.h" @@ -53,11 +52,15 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include <functional> using namespace llvm; @@ -66,22 +69,6 @@ using namespace llvm; STATISTIC(GuardsEliminated, "Number of eliminated guards"); STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches"); -static cl::opt<bool> WidenFrequentBranches( - "guard-widening-widen-frequent-branches", cl::Hidden, - cl::desc("Widen conditions of explicit branches into dominating guards in " - "case if their taken frequency exceeds threshold set by " - "guard-widening-frequent-branch-threshold option"), - cl::init(false)); - -static cl::opt<unsigned> FrequentBranchThreshold( - "guard-widening-frequent-branch-threshold", cl::Hidden, - cl::desc("When WidenFrequentBranches is set to true, this option is used " - "to determine which branches are frequently taken. The criteria " - "that a branch is taken more often than " - "((FrequentBranchThreshold - 1) / FrequentBranchThreshold), then " - "it is considered frequently taken"), - cl::init(1000)); - static cl::opt<bool> WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden, cl::desc("Whether or not we should widen guards " @@ -97,15 +84,16 @@ static Value *getCondition(Instruction *I) { "Bad guard intrinsic?"); return GI->getArgOperand(0); } - if (isGuardAsWidenableBranch(I)) { - auto *Cond = cast<BranchInst>(I)->getCondition(); - return cast<BinaryOperator>(Cond)->getOperand(0); - } + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB)) + return Cond; + return cast<BranchInst>(I)->getCondition(); } // Set the condition for \p I to \p NewCond. \p I can either be a guard or a -// conditional branch. +// conditional branch. static void setCondition(Instruction *I, Value *NewCond) { if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) { assert(GI->getIntrinsicID() == Intrinsic::experimental_guard && @@ -126,7 +114,6 @@ class GuardWideningImpl { DominatorTree &DT; PostDominatorTree *PDT; LoopInfo &LI; - BranchProbabilityInfo *BPI; /// Together, these describe the region of interest. This might be all of /// the blocks within a function, or only a given loop's blocks and preheader. 
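For context on the getCondition() hunk above: parseWidenableBranch() comes from llvm/Transforms/Utils/GuardUtils.h (newly included by this patch) and centralizes the hand-written matching it replaces, keeping getCondition() and widenGuard() in agreement about what counts as a widenable branch. A rough sketch of the common shape it recognizes, in the PatternMatch style used elsewhere in this file (the helper name is illustrative, not the actual implementation, which also handles commuted and degenerate forms):

  // Sketch only: a guard expressed as a branch on (%cond & %wc), where %wc
  // is produced by @llvm.experimental.widenable.condition().
  static bool looksLikeWidenableBranch(Instruction *I) {
    using namespace PatternMatch;
    Value *Cond = nullptr, *WC = nullptr;
    BasicBlock *IfTrueBB = nullptr, *IfFalseBB = nullptr;
    return match(I, m_Br(m_And(m_Value(Cond), m_Value(WC)), IfTrueBB,
                         IfFalseBB)) &&
           match(WC, m_Intrinsic<Intrinsic::experimental_widenable_condition>());
  }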
@@ -271,26 +258,22 @@ class GuardWideningImpl { void widenGuard(Instruction *ToWiden, Value *NewCondition, bool InvertCondition) { Value *Result; + widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result, InvertCondition); - Value *WidenableCondition = nullptr; if (isGuardAsWidenableBranch(ToWiden)) { - auto *Cond = cast<BranchInst>(ToWiden)->getCondition(); - WidenableCondition = cast<BinaryOperator>(Cond)->getOperand(1); + setWidenableBranchCond(cast<BranchInst>(ToWiden), Result); + return; } - if (WidenableCondition) - Result = BinaryOperator::CreateAnd(Result, WidenableCondition, - "guard.chk", ToWiden); setCondition(ToWiden, Result); } public: explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT, - LoopInfo &LI, BranchProbabilityInfo *BPI, - DomTreeNode *Root, + LoopInfo &LI, DomTreeNode *Root, std::function<bool(BasicBlock*)> BlockFilter) - : DT(DT), PDT(PDT), LI(LI), BPI(BPI), Root(Root), BlockFilter(BlockFilter) + : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter) {} /// The entry point for this pass. @@ -309,13 +292,6 @@ static bool isSupportedGuardInstruction(const Instruction *Insn) { bool GuardWideningImpl::run() { DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock; bool Changed = false; - Optional<BranchProbability> LikelyTaken = None; - if (WidenFrequentBranches && BPI) { - unsigned Threshold = FrequentBranchThreshold; - assert(Threshold > 0 && "Zero threshold makes no sense!"); - LikelyTaken = BranchProbability(Threshold - 1, Threshold); - } - for (auto DFI = df_begin(Root), DFE = df_end(Root); DFI != DFE; ++DFI) { auto *BB = (*DFI)->getBlock(); @@ -330,17 +306,6 @@ bool GuardWideningImpl::run() { for (auto *II : CurrentList) Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock); - if (WidenFrequentBranches && BPI) - if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) - if (BI->isConditional()) { - // If one of branches of a conditional is likely taken, try to - // eliminate it. 
- if (BPI->getEdgeProbability(BB, 0U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock); - else if (BPI->getEdgeProbability(BB, 1U) >= *LikelyTaken) - Changed |= eliminateInstrViaWidening(BI, DFI, GuardsInBlock, - /*InvertCondition*/true); - } } assert(EliminatedGuardsAndBranches.empty() || Changed); @@ -591,7 +556,7 @@ bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1, else Result = RC.getCheckInst(); } - + assert(Result && "Failed to find result value"); Result->setName("wide.chk"); } return true; @@ -805,10 +770,7 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = AM.getCachedResult<BranchProbabilityAnalysis>(F); - if (!GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run()) return PreservedAnalyses::all(); @@ -820,22 +782,13 @@ PreservedAnalyses GuardWideningPass::run(Function &F, PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U) { - - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function &F = *L.getHeader()->getParent(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(F); - BasicBlock *RootBB = L.getLoopPredecessor(); if (!RootBB) RootBB = L.getHeader(); auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L.contains(BB); }; - if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, BPI, - AR.DT.getNode(RootBB), + if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB), BlockFilter).run()) return PreservedAnalyses::all(); @@ -856,10 +809,7 @@ struct GuardWideningLegacyPass : public FunctionPass { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(); - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, &PDT, LI, BPI, DT.getRootNode(), + return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(), [](BasicBlock*) { return true; } ).run(); } @@ -868,8 +818,6 @@ struct GuardWideningLegacyPass : public FunctionPass { AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<PostDominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); } }; @@ -895,16 +843,11 @@ struct LoopGuardWideningLegacyPass : public LoopPass { auto BlockFilter = [&](BasicBlock *BB) { return BB == RootBB || L->contains(BB); }; - BranchProbabilityInfo *BPI = nullptr; - if (WidenFrequentBranches) - BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); - return GuardWideningImpl(DT, PDT, LI, BPI, + return GuardWideningImpl(DT, PDT, LI, DT.getNode(RootBB), BlockFilter).run(); } void getAnalysisUsage(AnalysisUsage &AU) const override { - if (WidenFrequentBranches) - AU.addRequired<BranchProbabilityInfoWrapperPass>(); AU.setPreservesCFG(); getLoopAnalysisUsage(AU); AU.addPreserved<PostDominatorTreeWrapperPass>(); @@ -920,8 +863,6 @@ INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", 
"Widen guards", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards", false, false) @@ -931,8 +872,6 @@ INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening", INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -if (WidenFrequentBranches) - INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass) INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening", "Widen guards (within a single loop, as a loop pass)", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index f9fc698a4a9b..d8d7acae5c9f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -31,8 +31,8 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -44,7 +44,6 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRange.h" @@ -68,6 +67,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -79,6 +79,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" #include <cassert> @@ -124,6 +125,10 @@ static cl::opt<bool> DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), cl::desc("Disable Linear Function Test Replace optimization")); +static cl::opt<bool> +LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true), + cl::desc("Predicate conditions in read only loops")); + namespace { struct RewritePhi; @@ -144,7 +149,11 @@ class IndVarSimplify { bool rewriteNonIntegerIVs(Loop *L); bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI); - bool optimizeLoopExits(Loop *L); + /// Try to eliminate loop exits based on analyzeable exit counts + bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter); + /// Try to form loop invariant tests for loop exits by changing how many + /// iterations of the loop run when that is unobservable. + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet); bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter); @@ -628,12 +637,30 @@ bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Okay, this instruction has a user outside of the current loop // and varies predictably *inside* the loop. 
Evaluate the value it - contains when the loop exits, if possible. + // contains when the loop exits, if possible. We prefer to start with + // expressions which are true for all exits (so as to maximize + // expression reuse by the SCEVExpander), but resort to per-exit + // evaluation if that fails. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!SE->isLoopInvariant(ExitValue, L) || - !isSafeToExpand(ExitValue, *SE)) - continue; - + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) { + // TODO: This should probably be sunk into SCEV in some way; maybe a + // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for + // most SCEV expressions and other recurrence types (e.g. shift + // recurrences). Is there existing code we can reuse? + const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i)); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst))) + if (AddRec->getLoop() == L) + ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE); + if (isa<SCEVCouldNotCompute>(ExitValue) || + !SE->isLoopInvariant(ExitValue, L) || + !isSafeToExpand(ExitValue, *SE)) + continue; + } + // Computing the value outside of the loop brings no benefit if it is // definitely used inside the loop in a way which can not be optimized // away. Avoid doing so unless we know we have a value which computes @@ -804,7 +831,7 @@ bool IndVarSimplify::canLoopBeDeleted( L->getExitingBlocks(ExitingBlocks); SmallVector<BasicBlock *, 8> ExitBlocks; L->getUniqueExitBlocks(ExitBlocks); - if (ExitBlocks.size() > 1 || ExitingBlocks.size() > 1) + if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1) return false; BasicBlock *ExitBlock = ExitBlocks[0]; @@ -1654,6 +1681,10 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) { return nullptr; } + // If we reached this point, then we are going to replace + // DU.NarrowUse with WideUse, so reattach DbgValue uses now. + replaceAllDbgUsesWith(*DU.NarrowUse, *WideUse, *WideUse, *DT); + ExtendKindMap[DU.NarrowUse] = WideAddRec.second; // Returning WideUse pushes it on the worklist. return WideUse; @@ -1779,14 +1810,9 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { DeadInsts.emplace_back(DU.NarrowDef); } - // Attach any debug information to the new PHI. Since OrigPhi and WidePHI - // evaluate the same recurrence, we can just copy the debug info over. - SmallVector<DbgValueInst *, 1> DbgValues; - llvm::findDbgValues(DbgValues, OrigPhi); - auto *MDPhi = MetadataAsValue::get(WidePhi->getContext(), - ValueAsMetadata::get(WidePhi)); - for (auto &DbgValue : DbgValues) - DbgValue->setOperand(0, MDPhi); + // Attach any debug information to the new PHI.
+ replaceAllDbgUsesWith(*OrigPhi, *WidePhi, *WidePhi, *DT); + return WidePhi; } @@ -1817,8 +1843,8 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = ConstantRange::makeAllowedICmpRegion(P, CmpRHSRange); - auto NarrowDefRange = - CmpConstrainedLHSRange.addWithNoSignedWrap(*NarrowDefRHS); + auto NarrowDefRange = CmpConstrainedLHSRange.addWithNoWrap( + *NarrowDefRHS, OverflowingBinaryOperator::NoSignedWrap); updatePostIncRangeInfo(NarrowDef, NarrowUser, NarrowDefRange); }; @@ -2242,8 +2268,8 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB, if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy()) continue; - const auto *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); - + const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi)); + // AR may be a pointer type, while BECount is an integer type. // AR may be wider than BECount. With eq/ne tests overflow is immaterial. // AR may not be a narrower type, or we may never exit. @@ -2624,74 +2650,125 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) { return MadeAnyChanges; } -bool IndVarSimplify::optimizeLoopExits(Loop *L) { +/// Return a symbolic upper bound for the backedge taken count of the loop. +/// This is more general than getConstantMaxBackedgeTakenCount as it returns +/// an arbitrary expression as opposed to only constants. +/// TODO: Move into the ScalarEvolution class. +static const SCEV* getMaxBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, Loop *L) { SmallVector<BasicBlock*, 16> ExitingBlocks; L->getExitingBlocks(ExitingBlocks); // Form an expression for the maximum exit count possible for this loop. We // merge the max and exact information to approximate a version of - // getMaxBackedgeTakenInfo which isn't restricted to just constants. - // TODO: factor this out as a version of getMaxBackedgeTakenCount which - // isn't guaranteed to return a constant. + // getConstantMaxBackedgeTakenCount which isn't restricted to just constants. SmallVector<const SCEV*, 4> ExitCounts; - const SCEV *MaxConstEC = SE->getMaxBackedgeTakenCount(L); - if (!isa<SCEVCouldNotCompute>(MaxConstEC)) - ExitCounts.push_back(MaxConstEC); for (BasicBlock *ExitingBB : ExitingBlocks) { - const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + ExitCount = SE.getExitCount(L, ExitingBB, + ScalarEvolution::ConstantMaximum); if (!isa<SCEVCouldNotCompute>(ExitCount)) { - assert(DT->dominates(ExitingBB, L->getLoopLatch()) && + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && "We should only have known counts for exiting blocks that " "dominate latch!"); ExitCounts.push_back(ExitCount); } } if (ExitCounts.empty()) - return false; - const SCEV *MaxExitCount = SE->getUMinFromMismatchedTypes(ExitCounts); + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} - bool Changed = false; - for (BasicBlock *ExitingBB : ExitingBlocks) { +bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + // Remove all exits which aren't both rewritable and analyzable. + auto NewEnd = llvm::remove_if(ExitingBlocks, + [&](BasicBlock *ExitingBB) { // If our exiting block exits multiple loops, we can only rewrite the // innermost one.
Otherwise, we're changing how many times the innermost // loop runs before it exits. if (LI->getLoopFor(ExitingBB) != L) - continue; + return true; // Can't rewrite non-branch yet. BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); if (!BI) - continue; + return true; // If already constant, nothing to do. if (isa<Constant>(BI->getCondition())) - continue; + return true; const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); if (isa<SCEVCouldNotCompute>(ExitCount)) - continue; + return true; + return false; + }); + ExitingBlocks.erase(NewEnd, ExitingBlocks.end()); + + if (ExitingBlocks.empty()) + return false; + + // Get a symbolic upper bound on the loop backedge taken count. + const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MaxExitCount)) + return false; + // Visit our exit blocks in order of dominance. We know from the fact that + // all exits (left) are analyzable that there must be a total dominance order + // between them as each must dominate the latch. The visit order only + // matters for the provably equal case. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + llvm_unreachable("expected total dominance order!"); + }); +#ifndef NDEBUG + for (unsigned i = 1; i < ExitingBlocks.size(); i++) { + assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])); + } +#endif + + auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) { + BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator()); + bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + auto *OldCond = BI->getCondition(); + auto *NewCond = ConstantInt::get(OldCond->getType(), + IsTaken ? ExitIfTrue : !ExitIfTrue); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + }; + + bool Changed = false; + SmallSet<const SCEV*, 8> DominatingExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above"); + // If we know we'd exit on the first iteration, rewrite the exit to // reflect this. This does not imply the loop must exit through this // exit; there may be an earlier one taken on the first iteration. // TODO: Given we know the backedge can't be taken, we should go ahead // and break it. Or at least, kill all the header phis and simplify. if (ExitCount->isZero()) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getTrue(OldCond->getType()) : - ConstantInt::getFalse(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, true); Changed = true; continue; } - // If we end up with a pointer exit count, bail. + // If we end up with a pointer exit count, bail. Note that we can end up + // with a pointer exit count for one exiting block, and not for another in + // the same loop.
if (!ExitCount->getType()->isIntegerTy() || !MaxExitCount->getType()->isIntegerTy()) - return false; + continue; Type *WiderType = SE->getWiderType(MaxExitCount->getType(), ExitCount->getType()); @@ -2700,32 +2777,203 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L) { assert(MaxExitCount->getType() == ExitCount->getType()); // Can we prove that some other exit must be taken strictly before this - // one? TODO: handle cases where ule is known, and equality is covered - // by a dominating exit + // one? if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT, MaxExitCount, ExitCount)) { - bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); - auto *OldCond = BI->getCondition(); - auto *NewCond = ExitIfTrue ? ConstantInt::getFalse(OldCond->getType()) : - ConstantInt::getTrue(OldCond->getType()); - BI->setCondition(NewCond); - if (OldCond->use_empty()) - DeadInsts.push_back(OldCond); + FoldExit(ExitingBB, false); Changed = true; continue; } - // TODO: If we can prove that the exiting iteration is equal to the exit - // count for this exit and that no previous exit oppurtunities exist within - // the loop, then we can discharge all other exits. (May fall out of - // previous TODO.) - - // TODO: If we can't prove any relation between our exit count and the - // loops exit count, but taking this exit doesn't require actually running - // the loop (i.e. no side effects, no computed values used in exit), then - // we can replace the exit test with a loop invariant test which exits on - // the first iteration. + // As we run, keep track of which exit counts we've encountered. If we + // find a duplicate, we've found an exit which would have exited on the + // exiting iteration, but (from the visit order) strictly follows another + // which does the same and is thus dead. + if (!DominatingExitCounts.insert(ExitCount).second) { + FoldExit(ExitingBB, false); + Changed = true; + continue; + } + + // TODO: There might be another opportunity to leverage SCEV's reasoning + // here. If we kept track of the min of dominating exits so far, we could + // discharge exits with EC >= MDEC. This is less powerful than the existing + // transform (since later exits aren't considered), but potentially more + // powerful for any case where SCEV can prove a >=u b, but neither a == b + // nor a >u b. Such a case is not currently known. + } + return Changed; +} + +bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + SmallVector<BasicBlock*, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + bool Changed = false; + + // Finally, see if we can rewrite our exit conditions into a loop invariant + // form. If we have a read-only loop, and we can tell that we must exit down + // a path which does not need any of the values computed within the loop, we + // can rewrite the loop to exit on the first iteration. Note that this + // doesn't either a) tell us the loop exits on the first iteration (unless + // *all* exits are predicateable) or b) tell us *which* exit might be taken. + // This transformation looks a lot like a restricted form of dead loop + // elimination, but restricted to read-only loops and without necessarily + // needing to kill the loop entirely. + if (!LoopPredication) + return Changed; + + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return Changed; + + // Note: ExactBTC is the exact backedge taken count *iff* the loop exits + // through *explicit* control flow. We have to eliminate the possibility of + // implicit exits (see below) before we know it's truly exact.
+ const SCEV *ExactBTC = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(ExactBTC) || + !SE->isLoopInvariant(ExactBTC, L) || + !isSafeToExpand(ExactBTC, *SE)) + return Changed; + + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExactBTC->getType()->isIntegerTy()) + return Changed; + + auto BadExit = [&](BasicBlock *ExitingBB) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + return true; + + // Can't rewrite non-branch yet. + BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + return true; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + return true; + + // If the exit block has phis, we need to be able to compute the values + // within the loop which contains them. This assumes trivial LCSSA phis + // have already been removed; TODO: generalize + BasicBlock *ExitBlock = + BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0); + if (!ExitBlock->phis().empty()) + return true; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count"); + if (!SE->isLoopInvariant(ExitCount, L) || + !isSafeToExpand(ExitCount, *SE)) + return true; + + // If we end up with a pointer exit count, bail. It may be unsized. + if (!ExitCount->getType()->isIntegerTy()) + return true; + + return false; + }; + + // If we have any exits which can't be predicated themselves, then we can't + // predicate any exit which isn't guaranteed to execute before it. Consider + // two exits (a) and (b) which would both exit on the same iteration. If we + // can predicate (b), but not (a), and (a) precedes (b) along some path, then + // we could convert a loop from exiting through (a) to one exiting through + // (b). Note that this problem exists only for exits with the same exit + // count, and we could be more aggressive when exit counts are known unequal. + llvm::sort(ExitingBlocks, + [&](BasicBlock *A, BasicBlock *B) { + // std::sort sorts in ascending order, so we want the inverse of + // the normal dominance relation, plus a tie breaker for blocks + // unordered by dominance. + if (DT->properlyDominates(A, B)) return true; + if (DT->properlyDominates(B, A)) return false; + return A->getName() < B->getName(); + }); + // Check to see if our exit blocks are a total order (i.e. a linear chain of + // exits before the backedge). If they aren't, reasoning about reachability + // is complicated and we choose not to for now. + for (unsigned i = 1; i < ExitingBlocks.size(); i++) + if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i])) + return Changed; + + // Given our sorted total order, we know that exit[j] must be evaluated + // after all exit[i] such that j > i. + for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++) + if (BadExit(ExitingBlocks[i])) { + ExitingBlocks.resize(i); + break; + } + + if (ExitingBlocks.empty()) + return Changed; + + // We rely on not being able to reach an exiting block on a later iteration + // than its statically computed exit count. The implementation of + // getExitCount currently has this invariant, but assert it here so that + // breakage is obvious if this ever changes.
+ assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) { + return DT->dominates(ExitingBB, L->getLoopLatch()); + })); + + // At this point, ExitingBlocks consists of only those blocks which are + // predicatable. Given that, we know we have at least one exit we can + // predicate if the loop doesn't have side effects and doesn't have any + // implicit exits (because then our exact BTC isn't actually exact). + // @Reviewers - As structured, this is O(I^2) for loop nests. Any + // suggestions on how to improve this? I can obviously bail out for outer + // loops, but that seems less than ideal. MemorySSA can find memory writes, + // is that enough for *all* side effects? + for (BasicBlock *BB : L->blocks()) + for (auto &I : *BB) + // TODO: isGuaranteedToTransfer + if (I.mayHaveSideEffects() || I.mayThrow()) + return Changed; + + // Finally, do the actual predication for all predicatable blocks. A couple + // of notes here: + // 1) We don't bother to constant fold dominated exits with identical exit + // counts; that's simply a form of CSE/equality propagation and we leave + // it for dedicated passes. + // 2) We insert the comparison at the branch. Hoisting introduces additional + // legality constraints and we leave that to dedicated logic. We want to + // predicate even if we can't insert a loop invariant expression as + // peeling or unrolling will likely reduce the cost of the otherwise loop + // varying check. + Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + Value *ExactBTCV = nullptr; // Lazily generated if needed. + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + + auto *BI = cast<BranchInst>(ExitingBB->getTerminator()); + Value *NewCond; + if (ExitCount == ExactBTC) { + NewCond = L->contains(BI->getSuccessor(0)) ? + B.getFalse() : B.getTrue(); + } else { + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!ExactBTCV) + ExactBTCV = Rewriter.expandCodeFor(ExactBTC); + Value *RHS = ExactBTCV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + auto Pred = L->contains(BI->getSuccessor(0)) ? + ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; + NewCond = B.CreateICmp(Pred, ECV, RHS); + } + Value *OldCond = BI->getCondition(); + BI->setCondition(NewCond); + if (OldCond->use_empty()) + DeadInsts.push_back(OldCond); + Changed = true; } + return Changed; } @@ -2751,12 +2999,15 @@ bool IndVarSimplify::run(Loop *L) { if (!L->isLoopSimplifyForm()) return false; +#ifndef NDEBUG + // Used below for a consistency check only + const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); +#endif + // If there are any floating-point recurrences, attempt to // transform them to use integer recurrences. Changed |= rewriteNonIntegerIVs(L); - const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L); - // Create a rewriter object which we'll use to transform the code with. SCEVExpander Rewriter(*SE, DL, "indvars"); #ifndef NDEBUG @@ -2772,20 +3023,30 @@ bool IndVarSimplify::run(Loop *L) { Rewriter.disableCanonicalMode(); Changed |= simplifyAndExtend(L, Rewriter, LI); - // Check to see if this loop has a computable loop-invariant execution count.
- // If so, this means that we can compute the final value of any expressions + // Check to see if we can compute the final value of any expressions // that are recurrent in the loop, and substitute the exit values from the - // loop into any instructions outside of the loop that use the final values of - // the current expressions. - // - if (ReplaceExitValue != NeverRepl && - !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) + // loop into any instructions outside of the loop that use the final values + // of the current expressions. + if (ReplaceExitValue != NeverRepl) Changed |= rewriteLoopExitValues(L, Rewriter); // Eliminate redundant IV cycles. NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts); - Changed |= optimizeLoopExits(L); + // Try to eliminate loop exits based on analyzable exit counts + if (optimizeLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } + + // Try to form loop invariant tests for loop exits by changing how many + // iterations of the loop run when that is unobservable. + if (predicateLoopExits(L, Rewriter)) { + Changed = true; + // Given we've changed exit counts, notify SCEV + SE->forgetLoop(L); + } // If we have a trip count expression, rewrite the loop's exit condition // using it. @@ -2825,7 +3086,7 @@ bool IndVarSimplify::run(Loop *L) { // that our definition of "high cost" is not exactly principled. if (Rewriter.isHighCostExpansion(ExitCount, L)) continue; - + // Check preconditions for proper SCEVExpander operation. SCEV does not // express SCEVExpander's dependencies, such as LoopSimplify. Instead // any pass that uses the SCEVExpander must do it. This does not work @@ -2873,7 +3134,8 @@ bool IndVarSimplify::run(Loop *L) { "Indvars did not preserve LCSSA!"); // Verify that LFTR, and any other change have not interfered with SCEV's - // ability to compute trip count. + // ability to compute trip count. We may have *changed* the exit count, but + // only by reducing it. #ifndef NDEBUG if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) { SE->forgetLoop(L); @@ -2885,7 +3147,8 @@ bool IndVarSimplify::run(Loop *L) { else BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, NewBECount->getType()); - assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV"); + assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount, + NewBECount) && "indvars must preserve SCEV"); } #endif @@ -2924,7 +3187,7 @@ struct IndVarSimplifyLegacyPass : public LoopPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>(); auto *TTI = TTIP ?
&TTIP->getTTI(*L->getHeader()->getParent()) : nullptr; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 997d68838152..58469749600e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -74,6 +74,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 5f0e2001c73d..dfb1b6bfb739 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -141,9 +141,11 @@ using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; /// InferAddressSpaces class InferAddressSpaces : public FunctionPass { + const TargetTransformInfo *TTI = nullptr; + /// Target specific address space which uses of should be replaced if /// possible. - unsigned FlatAddrSpace; + unsigned FlatAddrSpace = 0; public: static char ID; @@ -264,17 +266,6 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, Module *M = II->getParent()->getParent()->getParent(); switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: { - const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4)); - if (!IsVolatile || !IsVolatile->isZero()) - return false; - - LLVM_FALLTHROUGH; - } case Intrinsic::objectsize: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); @@ -285,25 +276,27 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, return true; } default: - return false; + return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV); } } -// TODO: Move logic to TTI? 
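The default case above now defers to TTI::rewriteIntrinsicWithAddressSpace, and the function that follows likewise queries TTI::collectFlatAddressOperands, so the AMDGPU-specific intrinsic list deleted here can live with the target instead. A hedged sketch of the collect side of that contract (the class name is a placeholder and the intrinsic choices are illustrative; the in-tree AMDGPU implementation may differ in detail):

  // Report which operands of an intrinsic are flat-address pointers that
  // InferAddressSpaces may rewrite; returning true means OpIndexes is
  // meaningful and the pass should traverse those operands.
  bool MyTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                             Intrinsic::ID IID) const {
    switch (IID) {
    case Intrinsic::amdgcn_atomic_inc:
    case Intrinsic::amdgcn_atomic_dec:
      OpIndexes.push_back(0); // the pointer operand
      return true;
    default:
      return false;
    }
  }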
void InferAddressSpaces::collectRewritableIntrinsicOperands( IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack, DenseSet<Value *> &Visited) const { - switch (II->getIntrinsicID()) { + auto IID = II->getIntrinsicID(); + switch (IID) { case Intrinsic::objectsize: - case Intrinsic::amdgcn_atomic_inc: - case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: - case Intrinsic::amdgcn_ds_fmin: - case Intrinsic::amdgcn_ds_fmax: appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0), PostorderStack, Visited); break; default: + SmallVector<int, 2> OpIndexes; + if (TTI->collectFlatAddressOperands(OpIndexes, IID)) { + for (int Idx : OpIndexes) { + appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx), + PostorderStack, Visited); + } + } break; } } @@ -631,11 +624,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) { if (skipFunction(F)) return false; - const TargetTransformInfo &TTI = - getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); if (FlatAddrSpace == UninitializedAddressSpace) { - FlatAddrSpace = TTI.getFlatAddressSpace(); + FlatAddrSpace = TTI->getFlatAddressSpace(); if (FlatAddrSpace == UninitializedAddressSpace) return false; } @@ -650,7 +642,7 @@ bool InferAddressSpaces::runOnFunction(Function &F) { // Changes the address spaces of the flat address expressions who are inferred // to point to a specific address space. - return rewriteWithNewAddressSpaces(TTI, Postorder, InferredAddrSpace, &F); + return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F); } // Constants need to be tracked through RAUW to handle cases with nested @@ -799,8 +791,8 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias); if (auto *MSI = dyn_cast<MemSetInst>(MI)) { - B.CreateMemSet(NewV, MSI->getValue(), - MSI->getLength(), MSI->getDestAlignment(), + B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(), + MaybeAlign(MSI->getDestAlignment()), false, // isVolatile TBAA, ScopeMD, NoAliasMD); } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) { @@ -816,15 +808,13 @@ static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV, if (isa<MemCpyInst>(MTI)) { MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); - B.CreateMemCpy(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, TBAAStruct, ScopeMD, NoAliasMD); } else { assert(isa<MemMoveInst>(MTI)); - B.CreateMemMove(Dest, MTI->getDestAlignment(), - Src, MTI->getSourceAlignment(), + B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), false, // isVolatile TBAA, ScopeMD, NoAliasMD); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp index 6616364ab203..e8bbf2936da6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" @@ -33,37 +34,39 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ, bool Changed = false; do 
{ - for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { - // Here be subtlety: the iterator must be incremented before the loop - // body (not sure why), so a range-for loop won't work here. - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { - Instruction *I = &*BI++; - // The first time through the loop ToSimplify is empty and we try to - // simplify all instructions. On later iterations ToSimplify is not + for (BasicBlock &BB : F) { + // Unreachable code can take on strange forms that we are not prepared to + // handle. For example, an instruction may have itself as an operand. + if (!SQ.DT->isReachableFromEntry(&BB)) + continue; + + SmallVector<Instruction *, 8> DeadInstsInBB; + for (Instruction &I : BB) { + // The first time through the loop, ToSimplify is empty and we try to + // simplify all instructions. On later iterations, ToSimplify is not // empty and we only bother simplifying instructions that are in it. - if (!ToSimplify->empty() && !ToSimplify->count(I)) + if (!ToSimplify->empty() && !ToSimplify->count(&I)) continue; - // Don't waste time simplifying unused instructions. - if (!I->use_empty()) { - if (Value *V = SimplifyInstruction(I, SQ, ORE)) { + // Don't waste time simplifying dead/unused instructions. + if (isInstructionTriviallyDead(&I)) { + DeadInstsInBB.push_back(&I); + Changed = true; + } else if (!I.use_empty()) { + if (Value *V = SimplifyInstruction(&I, SQ, ORE)) { // Mark all uses for resimplification next time round the loop. - for (User *U : I->users()) + for (User *U : I.users()) Next->insert(cast<Instruction>(U)); - I->replaceAllUsesWith(V); + I.replaceAllUsesWith(V); ++NumSimplified; Changed = true; + // A call can get simplified, but it may not be trivially dead. + if (isInstructionTriviallyDead(&I)) + DeadInstsInBB.push_back(&I); } } - if (RecursivelyDeleteTriviallyDeadInstructions(I, SQ.TLI)) { - // RecursivelyDeleteTriviallyDeadInstruction can remove more than one - // instruction, so simply incrementing the iterator does not work. - // When instructions get deleted re-iterate instead. - BI = BB->begin(); - BE = BB->end(); - Changed = true; - } } + RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI); } // Place the list of instructions to simplify on the next loop iteration @@ -90,7 +93,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } - /// runOnFunction - Remove instructions that simplify. + /// Remove instructions that simplify. 
bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; @@ -98,7 +101,7 @@ struct InstSimplifyLegacyPass : public FunctionPass { const DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); const TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); AssumptionCache *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); OptimizationRemarkEmitter *ORE = diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b86bf2fefbe5..98c2fcb3dae0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" @@ -224,13 +225,21 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> { auto *PredBB = IncomingBB; auto *SuccBB = PhiBB; + SmallPtrSet<BasicBlock *, 16> Visited; while (true) { BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); if (PredBr && PredBr->isConditional()) return {PredBB, SuccBB}; + Visited.insert(PredBB); auto *SinglePredBB = PredBB->getSinglePredecessor(); if (!SinglePredBB) return {nullptr, nullptr}; + + // Stop searching when SinglePredBB has been visited. It means we see + // an unreachable loop. + if (Visited.count(SinglePredBB)) + return {nullptr, nullptr}; + SuccBB = PredBB; PredBB = SinglePredBB; } @@ -253,7 +262,9 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { return; BasicBlock *PredBB = PredOutEdge.first; - BranchInst *PredBr = cast<BranchInst>(PredBB->getTerminator()); + BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()); + if (!PredBr) + return; uint64_t PredTrueWeight, PredFalseWeight; // FIXME: We currently only set the profile data when it is missing. @@ -286,7 +297,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) { bool JumpThreading::runOnFunction(Function &F) { if (skipFunction(F)) return false; - auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); // Get DT analysis before LVI. When LVI is initialized it conditionally adds // DT if it's available. 
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -295,14 +306,13 @@ bool JumpThreading::runOnFunction(Function &F) { DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy); std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; - bool HasProfileData = F.hasProfileData(); - if (HasProfileData) { + if (F.hasProfileData()) { LoopInfo LI{DominatorTree(F)}; BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, HasProfileData, + bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (PrintLVIAfterJumpThreading) { dbgs() << "LVI for function '" << F.getName() << "':\n"; @@ -329,7 +339,7 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, HasProfileData, + bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(), std::move(BFI), std::move(BPI)); if (!Changed) @@ -992,49 +1002,8 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. - if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && - SinglePred != BB && !hasAddressTakenAndUsed(BB)) { - // If SinglePred was a loop header, BB becomes one. - if (LoopHeaders.erase(SinglePred)) - LoopHeaders.insert(BB); - - LVI->eraseBlock(SinglePred); - MergeBasicBlockIntoOnlyPred(BB, DTU); - - // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by - // BB code within one basic block `BB`), we need to invalidate the LVI - // information associated with BB, because the LVI information need not be - // true for all of BB after the merge. For example, - // Before the merge, LVI info and code is as follows: - // SinglePred: <LVI info1 for %p val> - // %y = use of %p - // call @exit() // need not transfer execution to successor. - // assume(%p) // from this point on %p is true - // br label %BB - // BB: <LVI info2 for %p val, i.e. %p is true> - // %x = use of %p - // br label exit - // - // Note that this LVI info for blocks BB and SinglPred is correct for %p - // (info2 and info1 respectively). After the merge and the deletion of the - // LVI info1 for SinglePred. We have the following code: - // BB: <LVI info2 for %p val> - // %y = use of %p - // call @exit() - // assume(%p) - // %x = use of %p <-- LVI info2 is correct from here onwards. - // br label exit - // LVI info2 for BB is incorrect at the beginning of BB. - - // Invalidate LVI information for BB if the LVI is not provably true for - // all of BB. 
- if (!isGuaranteedToTransferExecutionToSuccessor(BB)) - LVI->eraseBlock(BB); - return true; - } - } + if (MaybeMergeBasicBlockIntoOnlyPred(BB)) + return true; if (TryToUnfoldSelectInCurrBB(BB)) return true; @@ -1461,7 +1430,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { "Can't handle critical edge here!"); LoadInst *NewVal = new LoadInst( LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), - LoadI->getName() + ".pr", false, LoadI->getAlignment(), + LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()), LoadI->getOrdering(), LoadI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LoadI->getDebugLoc()); @@ -1748,7 +1717,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, getSuccessor(GetBestDestForJumpOnUndef(BB)); // Ok, try to thread it! - return ThreadEdge(BB, PredsToFactor, MostPopularDest); + return TryThreadEdge(BB, PredsToFactor, MostPopularDest); } /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on @@ -1910,12 +1879,146 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, } } -/// ThreadEdge - We have decided that it is safe and profitable to factor the -/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB -/// across BB. Transform the IR to reflect this change. -bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, - const SmallVectorImpl<BasicBlock *> &PredBBs, - BasicBlock *SuccBB) { +/// Merge basic block BB into its sole predecessor if possible. +bool JumpThreadingPass::MaybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) { + BasicBlock *SinglePred = BB->getSinglePredecessor(); + if (!SinglePred) + return false; + + const Instruction *TI = SinglePred->getTerminator(); + if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 || + SinglePred == BB || hasAddressTakenAndUsed(BB)) + return false; + + // If SinglePred was a loop header, BB becomes one. + if (LoopHeaders.erase(SinglePred)) + LoopHeaders.insert(BB); + + LVI->eraseBlock(SinglePred); + MergeBasicBlockIntoOnlyPred(BB, DTU); + + // Now that BB is merged into SinglePred (i.e. SinglePred code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, LVI info and code is as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglePred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred, we have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (!isGuaranteedToTransferExecutionToSuccessor(BB)) + LVI->eraseBlock(BB); + return true; +} + +/// Update the SSA form. NewBB contains instructions that are copied from BB. +/// ValueMapping maps old values in BB to new ones in NewBB.
+void JumpThreadingPass::UpdateSSA( + BasicBlock *BB, BasicBlock *NewBB, + DenseMap<Instruction *, Value *> &ValueMapping) { + // If there were values defined in BB that are used outside the block, then we + // now have to update all uses of the value to use either the original value, + // the cloned value, or some PHI derived value. This can require arbitrary + // PHI insertion, which we are prepared to do; clean these up now. + SSAUpdater SSAUpdate; + SmallVector<Use *, 16> UsesToRename; + + for (Instruction &I : *BB) { + // Scan all uses of this instruction to see if it is used outside of its + // block, and if so, record them in UsesToRename. + for (Use &U : I.uses()) { + Instruction *User = cast<Instruction>(U.getUser()); + if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(U) == BB) + continue; + } else if (User->getParent() == BB) + continue; + + UsesToRename.push_back(&U); + } + + // If there are no uses outside the block, we're done with this instruction. + if (UsesToRename.empty()) + continue; + LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); + + // We found a use of I outside of BB. Rename all uses of I that are outside + // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks + // with the two values we know. + SSAUpdate.Initialize(I.getType(), I.getName()); + SSAUpdate.AddAvailableValue(BB, &I); + SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); + + while (!UsesToRename.empty()) + SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); + LLVM_DEBUG(dbgs() << "\n"); + } +} + +/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone +/// arguments that come from PredBB. Return the map from the variables in the +/// source basic block to the variables in the newly created basic block. +DenseMap<Instruction *, Value *> +JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI, + BasicBlock::iterator BE, BasicBlock *NewBB, + BasicBlock *PredBB) { + // We are going to have to map operands from the source basic block to the new + // copy of the block 'NewBB'. If there are PHI nodes in the source basic + // block, evaluate them to account for entry from PredBB. + DenseMap<Instruction *, Value *> ValueMapping; + + // Clone the phi nodes of the source basic block into NewBB. The resulting + // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater + // might need to rewrite the operand of the cloned phi. + for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { + PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); + NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); + ValueMapping[PN] = NewPN; + } + + // Clone the non-phi instructions of the source basic block into NewBB, + // keeping track of the mapping and using it to remap operands in the cloned + // instructions. + for (; BI != BE; ++BI) { + Instruction *New = BI->clone(); + New->setName(BI->getName()); + NewBB->getInstList().push_back(New); + ValueMapping[&*BI] = New; + + // Remap operands to patch up intra-block references. + for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) + if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { + DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst); + if (I != ValueMapping.end()) + New->setOperand(i, I->second); + } + } + + return ValueMapping; +} + +/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
+bool JumpThreadingPass::TryThreadEdge( + BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. if (SuccBB == BB) { LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName() @@ -1945,6 +2048,21 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, return false; } + ThreadEdge(BB, PredBBs, SuccBB); + return true; +} + +/// ThreadEdge - We have decided that it is safe and profitable to factor the +/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB +/// across BB. Transform the IR to reflect this change. +void JumpThreadingPass::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock *> &PredBBs, + BasicBlock *SuccBB) { + assert(SuccBB != BB && "Don't create an infinite loop"); + + assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) && + "Don't thread across loop headers"); + // And finally, do it! Start by factoring the predecessors if needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1958,7 +2076,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // And finally, do it! LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() - << "' with cost: " << JumpThreadCost << ", across block:\n " << *BB << "\n"); if (DTU->hasPendingDomTreeUpdates()) @@ -1967,11 +2084,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, LVI->enableDT(); LVI->threadEdge(PredBB, BB, SuccBB); - // We are going to have to map operands from the original BB block to the new - // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to - // account for entry from PredBB. - DenseMap<Instruction*, Value*> ValueMapping; - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+".thread", BB->getParent(), BB); @@ -1984,32 +2096,9 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency()); } - BasicBlock::iterator BI = BB->begin(); - // Clone the phi nodes of BB into NewBB. The resulting phi nodes are trivial, - // since NewBB only has one predecessor, but SSAUpdater might need to rewrite - // the operand of the cloned phi. - for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { - PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB); - NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB); - ValueMapping[PN] = NewPN; - } - - // Clone the non-phi instructions of BB into NewBB, keeping track of the - // mapping and using it to remap operands in the cloned instructions. - for (; !BI->isTerminator(); ++BI) { - Instruction *New = BI->clone(); - New->setName(BI->getName()); - NewBB->getInstList().push_back(New); - ValueMapping[&*BI] = New; - - // Remap operands to patch up intra-block references. - for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) - if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { - DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst); - if (I != ValueMapping.end()) - New->setOperand(i, I->second); - } - } + // Copy all the instructions from BB to NewBB except the terminator. + DenseMap<Instruction *, Value *> ValueMapping = + CloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB); // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. 
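The UpdateSSA() helper shown earlier, which ThreadEdge now reaches after CloneInstructions(), is the standard SSAUpdater idiom. For readers who have not used that utility, a self-contained sketch of the same idiom (names invented for illustration; the real helper also special-cases PHI uses via their incoming block):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/SSAUpdater.h"
  using namespace llvm;

  // After cloning OrigI (in BB) as CloneI (in NewBB), rewrite every use of
  // OrigI outside BB so it sees whichever definition reaches it; SSAUpdater
  // inserts PHI nodes where the two definitions merge.
  static void rewriteOutsideUses(Instruction *OrigI, Instruction *CloneI,
                                 BasicBlock *BB, BasicBlock *NewBB) {
    SSAUpdater Updater;
    Updater.Initialize(OrigI->getType(), OrigI->getName());
    Updater.AddAvailableValue(BB, OrigI);
    Updater.AddAvailableValue(NewBB, CloneI);

    SmallVector<Use *, 16> Uses;
    for (Use &U : OrigI->uses())
      if (cast<Instruction>(U.getUser())->getParent() != BB)
        Uses.push_back(&U);

    while (!Uses.empty())
      Updater.RewriteUse(*Uses.pop_back_val());
  }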
@@ -2035,44 +2124,7 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, {DominatorTree::Insert, PredBB, NewBB}, {DominatorTree::Delete, PredBB, BB}}); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if their uses are no longer - // dominated by the previous def and if so, record them in UsesToRename. - // Also, skip phi operands from PredBB - we'll remove them anyway. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. - SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, NewBB, ValueMapping); // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This @@ -2084,7 +2136,6 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, // Threaded an edge! ++NumThreads; - return true; } /// Create a new basic block that will be the predecessor of BB and successor of @@ -2356,43 +2407,7 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); - // If there were values defined in BB that are used outside the block, then we - // now have to update all uses of the value to use either the original value, - // the cloned value, or some PHI derived value. This can require arbitrary - // PHI insertion, of which we are prepared to do, clean these up now. - SSAUpdater SSAUpdate; - SmallVector<Use*, 16> UsesToRename; - for (Instruction &I : *BB) { - // Scan all uses of this instruction to see if it is used outside of its - // block, and if so, record them in UsesToRename. - for (Use &U : I.uses()) { - Instruction *User = cast<Instruction>(U.getUser()); - if (PHINode *UserPN = dyn_cast<PHINode>(User)) { - if (UserPN->getIncomingBlock(U) == BB) - continue; - } else if (User->getParent() == BB) - continue; - - UsesToRename.push_back(&U); - } - - // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty()) - continue; - - LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); - - // We found a use of I outside of BB. Rename all uses of I that are outside - // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks - // with the two values we know. 
- SSAUpdate.Initialize(I.getType(), I.getName()); - SSAUpdate.AddAvailableValue(BB, &I); - SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&I]); - - while (!UsesToRename.empty()) - SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - LLVM_DEBUG(dbgs() << "\n"); - } + UpdateSSA(BB, PredBB, ValueMapping); // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. @@ -2423,7 +2438,7 @@ void JumpThreadingPass::UnfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB, // |----- // v // BB - BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator()); + BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator()); BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold", BB->getParent(), BB); // Move the unconditional branch to NewBB. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp index d9dda4cef2d2..8c33045c2380 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LICM.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -137,7 +138,8 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop, TargetTransformInfo *TTI, bool &FreeInLoop); static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE); static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); @@ -162,7 +164,7 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU); + MemorySSAUpdater *MSSAU, ScalarEvolution *SE); namespace { struct LoopInvariantCodeMotion { @@ -220,7 +222,8 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(), &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()), &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()), SE ? &SE->getSE() : nullptr, MSSA, &ORE, false); @@ -294,7 +297,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, PA.preserve<DominatorTreeAnalysis>(); PA.preserve<LoopAnalysis>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; @@ -330,6 +333,12 @@ bool LoopInvariantCodeMotion::runOnLoop( assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form."); + // If this loop has metadata indicating that LICM is not to be performed then + // just exit. 
+ if (hasDisableLICMTransformsHint(L)) { + return false; + } + std::unique_ptr<AliasSetTracker> CurAST; std::unique_ptr<MemorySSAUpdater> MSSAU; bool NoOfMemAccTooLarge = false; @@ -340,7 +349,7 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST = collectAliasInfoForLoop(L, LI, AA); } else { LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n"); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); unsigned AccessCapCount = 0; for (auto *BB : L->getBlocks()) { @@ -383,8 +392,9 @@ bool LoopInvariantCodeMotion::runOnLoop( CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); Flags.IsSink = false; if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, - CurAST.get(), MSSAU.get(), &SafetyInfo, Flags, ORE); + Changed |= + hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L, + CurAST.get(), MSSAU.get(), SE, &SafetyInfo, Flags, ORE); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -780,6 +790,41 @@ public: }; } // namespace + +/// Return true if we know how to rewrite all uses of the given alloca after +/// hoisting it out of the loop. The main concerns are a) potential captures +/// and b) invariant.start markers which don't capture, but are no longer +/// valid w/o a corresponding invariant.end. +static bool canRewriteUsesOfAlloca(AllocaInst &AI) { + // TODO: This looks a lot like capture tracking, but we need to remove any + // invariant starts if we extend the lifetime of the alloca by hoisting it. + // We should probably refactor capture tracking into a form which allows us + // to reuse the relevant bits and remove the duplicated logic here. + + SmallVector<Use *, 16> Worklist; + for (Use &U : AI.uses()) + Worklist.push_back(&U); + + unsigned NumUsesExplored = 0; + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + NumUsesExplored++; + if (NumUsesExplored > DefaultMaxUsesToExplore) + return false; + // Non capturing, terminating uses + if (isa<LoadInst>(I) || + (isa<StoreInst>(I) && U->getOperandNo() == 1)) + continue; + // Non capturing, non-terminating + if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I)) + return false; + for (Use &U : I->uses()) + Worklist.push_back(&U); + } + return true; +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -788,7 +833,7 @@ public: bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU, - ICFLoopSafetyInfo *SafetyInfo, + ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, OptimizationRemarkEmitter *ORE) { // Verify inputs. 
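Threading ScalarEvolution through hoistRegion (and, in the following hunks, through hoist and moveInstructionBefore) lets LICM drop cached SCEVs for instructions it moves via SE->forgetValue. A minimal sketch of the kind of loop-invariant computation this path moves into the preheader; names are illustrative, not from the patch:

// 'k * k + 1.0' has no loop-carried operands, so LICM hoists it out of the
// loop; forgetting the moved instruction in ScalarEvolution keeps any cached
// expressions for it from going stale.
void scale_all(double *a, int n, double k) {
  for (int i = 0; i < n; ++i)
    a[i] += k * k + 1.0;   // invariant subexpression hoisted to the preheader
}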
@@ -848,7 +893,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, I, DT, CurLoop, SafetyInfo, ORE, CurLoop->getLoopPreheader()->getTerminator())) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -875,7 +920,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, eraseInstruction(I, *SafetyInfo, CurAST, MSSAU); hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, ORE); + SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(ReciprocalDivisor); Changed = true; continue; @@ -894,7 +939,17 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, CurLoop->hasLoopInvariantOperands(&I) && MustExecuteWithoutWritesBefore(I)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); + HoistedInstructions.push_back(&I); + Changed = true; + continue; + } + + if (isa<AllocaInst>(&I) && + SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + canRewriteUsesOfAlloca(cast<AllocaInst>(I))) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); HoistedInstructions.push_back(&I); Changed = true; continue; @@ -908,7 +963,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, PN->setIncomingBlock( i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, ORE); + MSSAU, SE, ORE); assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); Changed = true; continue; @@ -945,7 +1000,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, LLVM_DEBUG(dbgs() << "LICM rehoisting to " << HoistPoint->getParent()->getName() << ": " << *I << "\n"); - moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU); + moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE); HoistPoint = I; Changed = true; } @@ -956,7 +1011,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, // Now that we've finished hoisting make sure that LI and DT are still // valid. -#ifndef NDEBUG +#ifdef EXPENSIVE_CHECKS if (Changed) { assert(DT->verify(DominatorTree::VerificationLevel::Fast) && "Dominator tree verification failed"); @@ -1026,7 +1081,8 @@ namespace { bool isHoistableAndSinkableInst(Instruction &I) { // Only these instructions are hoistable/sinkable. return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) || - isa<FenceInst>(I) || isa<BinaryOperator>(I) || isa<CastInst>(I) || + isa<FenceInst>(I) || isa<CastInst>(I) || + isa<UnaryOperator>(I) || isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) || isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) || isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) || @@ -1092,7 +1148,7 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // in the same alias set as something that ends up being modified. 
if (AA->pointsToConstantMemory(LI->getOperand(0))) return true; - if (LI->getMetadata(LLVMContext::MD_invariant_load)) + if (LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; if (LI->isAtomic() && !TargetExecutesOncePerLoop) @@ -1134,6 +1190,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // Assumes don't actually alias anything or throw return true; + if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) + // Widenable conditions don't actually alias anything or throw + return true; + // Handle simple cases by querying alias analysis. FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == FMRB_DoesNotAccessMemory) @@ -1240,12 +1300,22 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, // FIXME: More precise: no Uses that alias SI. if (!Flags->IsSink && !MSSA->dominates(SIMD, MU)) return false; - } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) + } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) { if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) { (void)LI; // Silence warning. assert(!LI->isUnordered() && "Expected unordered load"); return false; } + // Any call, while it may not be clobbering SI, it may be a use. + if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) { + // Check if the call may read from the memory locattion written + // to by SI. Check CI's attributes and arguments; the number of + // such checks performed is limited above by NoOfMemAccTooLarge. + ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI)); + if (isModOrRefSet(MRI)) + return false; + } + } } auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI); @@ -1375,8 +1445,7 @@ static Instruction *CloneInstructionInExitBlock( if (!I.getName().empty()) New->setName(I.getName() + ".le"); - MemoryAccess *OldMemAcc; - if (MSSAU && (OldMemAcc = MSSAU->getMemorySSA()->getMemoryAccess(&I))) { + if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) { // Create a new MemoryAccess and let MemorySSA set its defining access. 
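The new MemoryDef case above is about stores: a call inside the loop may not clobber the stored location, yet it can still read it, so getModRefInfo is now consulted before the store is allowed to move. A small source-level illustration; the declarations and names here are hypothetical:

void log_value(const int *p);      // opaque callee: may read *p

void count_up(int *counter, int n) {
  for (int i = 0; i < n; ++i) {
    *counter = i;                  // store examined through its MemoryDef
    log_value(counter);            // call that may read the stored location,
  }                                // so the store must not be moved past it
}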
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB( New, nullptr, New->getParent(), MemorySSA::Beginning); @@ -1385,7 +1454,7 @@ static Instruction *CloneInstructionInExitBlock( MSSAU->insertDef(MemDef, /*RenameUses=*/true); else { auto *MemUse = cast<MemoryUse>(NewMemAcc); - MSSAU->insertUse(MemUse); + MSSAU->insertUse(MemUse, /*RenameUses=*/true); } } } @@ -1424,14 +1493,18 @@ static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, static void moveInstructionBefore(Instruction &I, Instruction &Dest, ICFLoopSafetyInfo &SafetyInfo, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, + ScalarEvolution *SE) { SafetyInfo.removeInstruction(&I); SafetyInfo.insertInstructionTo(&I, Dest.getParent()); I.moveBefore(&Dest); if (MSSAU) if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( MSSAU->getMemorySSA()->getMemoryAccess(&I))) - MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), MemorySSA::End); + MSSAU->moveToPlace(OldMemAcc, Dest.getParent(), + MemorySSA::BeforeTerminator); + if (SE) + SE->forgetValue(&I); } static Instruction *sinkThroughTriviallyReplaceablePHI( @@ -1645,7 +1718,8 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT, /// static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo, - MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) { LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getName() << ": " << I << "\n"); ORE->emit([&]() { @@ -1666,10 +1740,10 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, if (isa<PHINode>(I)) // Move the new node to the end of the phi list in the destination block. - moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE); else // Move the new node to the destination block, before its terminator. - moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU); + moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE); // Apply line 0 debug locations when we are moving instructions to different // basic blocks because we want to avoid jumpy line tables. @@ -1783,7 +1857,7 @@ public: StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); @@ -2016,7 +2090,8 @@ bool llvm::promoteLoopAccessesToScalars( if (!DereferenceableInPH) { DereferenceableInPH = isDereferenceableAndAlignedPointer( Store->getPointerOperand(), Store->getValueOperand()->getType(), - Store->getAlignment(), MDL, Preheader->getTerminator(), DT); + MaybeAlign(Store->getAlignment()), MDL, + Preheader->getTerminator(), DT); } } else return false; // Not a load or store. 
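The surrounding LoopPromoter and promoteLoopAccessesToScalars changes (MaybeAlign on the rewritten store and on the dereferenceability query) sit on the register-promotion path, which turns a loop-carried memory location into a register with one load before the loop and one store after it. A minimal sketch of the pattern, assuming the usual no-alias and dereferenceability conditions hold; names are illustrative:

// Every iteration loads and stores *sum; after promotion the loop keeps the
// running total in a register, with a single preheader load and a single
// store on loop exit.
void accumulate(int *__restrict sum, const int *a, int n) {
  for (int i = 0; i < n; ++i)
    *sum += a[i];
}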
@@ -2101,20 +2176,21 @@ bool llvm::promoteLoopAccessesToScalars( SomePtr->getName() + ".promoted", Preheader->getTerminator()); if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); - PreheaderLoad->setAlignment(Alignment); + PreheaderLoad->setAlignment(MaybeAlign(Alignment)); PreheaderLoad->setDebugLoc(DL); if (AATags) PreheaderLoad->setAAMetadata(AATags); SSA.AddAvailableValue(Preheader, PreheaderLoad); - MemoryAccess *PreheaderLoadMemoryAccess; if (MSSAU) { - PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( + MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB( PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End); MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess); - MSSAU->insertUse(NewMemUse); + MSSAU->insertUse(NewMemUse, /*RenameUses=*/true); } + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); // Rewrite all the loads in the loop and remember all the definitions from // stores in the loop. Promoter.run(LoopUses); @@ -2161,7 +2237,7 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI, LoopToAliasSetMap.erase(MapI); } if (!CurAST) - CurAST = make_unique<AliasSetTracker>(*AA); + CurAST = std::make_unique<AliasSetTracker>(*AA); // Add everything from the sub loops that are no longer directly available. for (Loop *InnerL : RecomputeLoops) @@ -2180,7 +2256,7 @@ std::unique_ptr<AliasSetTracker> LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA( Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) { auto *MSSA = MSSAU->getMemorySSA(); - auto CurAST = make_unique<AliasSetTracker>(*AA, MSSA, L); + auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L); CurAST->addAllInstructionsInLoopUsingMSSA(); return CurAST; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index 1fcf1315a177..ab65f56d088f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/LoopDataPrefetch.h" +#include "llvm/InitializePasses.h" #define DEBUG_TYPE "loop-data-prefetch" #include "llvm/ADT/DepthFirstIterator.h" @@ -312,8 +313,8 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(MemI); Module *M = BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = - Intrinsic::getDeclaration(M, Intrinsic::prefetch); + Function *PrefetchFunc = Intrinsic::getDeclaration( + M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, {PrefPtrValue, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 8371367e24e7..2451572d6171 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -191,7 +192,7 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, // Don't remove loops for which we 
can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - const SCEV *S = SE.getMaxBackedgeTakenCount(L); + const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) { LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed ? LoopDeletionResult::Modified diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index f45e5fd0f50b..8e04e6e0ffe8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -55,6 +55,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 0bc2bcff2ae1..e1738f08eb23 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -55,18 +55,21 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/Function.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/CodeMoverUtils.h" using namespace llvm; #define DEBUG_TYPE "loop-fusion" -STATISTIC(FuseCounter, "Count number of loop fusions performed"); +STATISTIC(FuseCounter, "Loops fused"); STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion"); STATISTIC(InvalidPreheader, "Loop has invalid preheader"); STATISTIC(InvalidHeader, "Loop has invalid header"); @@ -79,12 +82,16 @@ STATISTIC(MayThrowException, "Loop may throw an exception"); STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access"); STATISTIC(NotSimplifiedForm, "Loop is not in simplified form"); STATISTIC(InvalidDependencies, "Dependencies prevent fusion"); -STATISTIC(InvalidTripCount, - "Loop does not have invariant backedge taken count"); +STATISTIC(UnknownTripCount, "Loop has unknown trip count"); STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop"); -STATISTIC(NonEqualTripCount, "Candidate trip counts are not the same"); -STATISTIC(NonAdjacent, "Candidates are not adjacent"); -STATISTIC(NonEmptyPreheader, "Candidate has a non-empty preheader"); +STATISTIC(NonEqualTripCount, "Loop trip counts are not the same"); +STATISTIC(NonAdjacent, "Loops are not adjacent"); +STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader"); +STATISTIC(FusionNotBeneficial, "Fusion is not beneficial"); +STATISTIC(NonIdenticalGuards, "Candidates have different guards"); +STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block"); +STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block"); +STATISTIC(NotRotated, "Candidate is not rotated"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -110,6 +117,7 @@ static cl::opt<bool> cl::Hidden, cl::init(false), cl::ZeroOrMore); #endif +namespace { /// This class is used to represent a candidate for loop fusion. 
When it is /// constructed, it checks the conditions for loop fusion to ensure that it /// represents a valid candidate. It caches several parts of a loop that are @@ -143,6 +151,8 @@ struct FusionCandidate { SmallVector<Instruction *, 16> MemWrites; /// Are all of the members of this fusion candidate still valid bool Valid; + /// Guard branch of the loop, if it exists + BranchInst *GuardBranch; /// Dominator and PostDominator trees are needed for the /// FusionCandidateCompare function, required by FusionCandidateSet to @@ -151,11 +161,14 @@ struct FusionCandidate { const DominatorTree *DT; const PostDominatorTree *PDT; + OptimizationRemarkEmitter &ORE; + FusionCandidate(Loop *L, const DominatorTree *DT, - const PostDominatorTree *PDT) + const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE) : Preheader(L->getLoopPreheader()), Header(L->getHeader()), ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()), - Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT) { + Latch(L->getLoopLatch()), L(L), Valid(true), + GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) { // Walk over all blocks in the loop and check for conditions that may // prevent fusion. For each block, walk over all instructions and collect @@ -163,28 +176,28 @@ struct FusionCandidate { // found, invalidate this object and return. for (BasicBlock *BB : L->blocks()) { if (BB->hasAddressTaken()) { - AddressTakenBB++; invalidate(); + reportInvalidCandidate(AddressTakenBB); return; } for (Instruction &I : *BB) { if (I.mayThrow()) { - MayThrowException++; invalidate(); + reportInvalidCandidate(MayThrowException); return; } if (StoreInst *SI = dyn_cast<StoreInst>(&I)) { if (SI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { if (LI->isVolatile()) { - ContainsVolatileAccess++; invalidate(); + reportInvalidCandidate(ContainsVolatileAccess); return; } } @@ -214,19 +227,100 @@ struct FusionCandidate { assert(Latch == L->getLoopLatch() && "Latch is out of sync"); } + /// Get the entry block for this fusion candidate. + /// + /// If this fusion candidate represents a guarded loop, the entry block is the + /// loop guard block. If it represents an unguarded loop, the entry block is + /// the preheader of the loop. + BasicBlock *getEntryBlock() const { + if (GuardBranch) + return GuardBranch->getParent(); + else + return Preheader; + } + + /// Given a guarded loop, get the successor of the guard that is not in the + /// loop. + /// + /// This method returns the successor of the loop guard that is not located + /// within the loop (i.e., the successor of the guard that is not the + /// preheader). + /// This method is only valid for guarded loops. + BasicBlock *getNonLoopBlock() const { + assert(GuardBranch && "Only valid on guarded loops."); + assert(GuardBranch->isConditional() && + "Expecting guard to be a conditional branch."); + return (GuardBranch->getSuccessor(0) == Preheader) + ? GuardBranch->getSuccessor(1) + : GuardBranch->getSuccessor(0); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void dump() const { - dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr") + dbgs() << "\tGuardBranch: "; + if (GuardBranch) + dbgs() << *GuardBranch; + else + dbgs() << "nullptr"; + dbgs() << "\n" + << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n" + << "\tPreheader: " << (Preheader ? 
Preheader->getName() : "nullptr") << "\n" << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n" << "\tExitingBB: " << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n" << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr") << "\n" - << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"; + << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n" + << "\tEntryBlock: " + << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr") + << "\n"; } #endif + /// Determine if a fusion candidate (representing a loop) is eligible for + /// fusion. Note that this only checks whether a single loop can be fused - it + /// does not check whether it is *legal* to fuse two loops together. + bool isEligibleForFusion(ScalarEvolution &SE) const { + if (!isValid()) { + LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n"); + if (!Preheader) + ++InvalidPreheader; + if (!Header) + ++InvalidHeader; + if (!ExitingBlock) + ++InvalidExitingBlock; + if (!ExitBlock) + ++InvalidExitBlock; + if (!Latch) + ++InvalidLatch; + if (L->isInvalid()) + ++InvalidLoop; + + return false; + } + + // Require ScalarEvolution to be able to determine a trip count. + if (!SE.hasLoopInvariantBackedgeTakenCount(L)) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " trip count not computable!\n"); + return reportInvalidCandidate(UnknownTripCount); + } + + if (!L->isLoopSimplifyForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() + << " is not in simplified form!\n"); + return reportInvalidCandidate(NotSimplifiedForm); + } + + if (!L->isRotatedForm()) { + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n"); + return reportInvalidCandidate(NotRotated); + } + + return true; + } + private: // This is only used internally for now, to clear the MemWrites and MemReads // list and setting Valid to false. I can't envision other uses of this right @@ -239,17 +333,18 @@ private: MemReads.clear(); Valid = false; } -}; - -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const FusionCandidate &FC) { - if (FC.isValid()) - OS << FC.Preheader->getName(); - else - OS << "<Invalid>"; - return OS; -} + bool reportInvalidCandidate(llvm::Statistic &Stat) const { + using namespace ore; + assert(L && Preheader && "Fusion candidate not initialized properly!"); + ++Stat; + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(), + L->getStartLoc(), Preheader) + << "[" << Preheader->getParent()->getName() << "]: " + << "Loop is not a candidate for fusion: " << Stat.getDesc()); + return false; + } +}; struct FusionCandidateCompare { /// Comparison functor to sort two Control Flow Equivalent fusion candidates @@ -260,21 +355,24 @@ struct FusionCandidateCompare { const FusionCandidate &RHS) const { const DominatorTree *DT = LHS.DT; + BasicBlock *LHSEntryBlock = LHS.getEntryBlock(); + BasicBlock *RHSEntryBlock = RHS.getEntryBlock(); + // Do not save PDT to local variable as it is only used in asserts and thus // will trigger an unused variable warning if building without asserts. assert(DT && LHS.PDT && "Expecting valid dominator tree"); // Do this compare first so if LHS == RHS, function returns false. 
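getEntryBlock and getNonLoopBlock above, together with the new rotation requirement in isEligibleForFusion, distinguish guarded from unguarded candidates: for a rotated loop whose zero-trip test survives as a guard, the candidate's entry is the guard block rather than the preheader. A source-level sketch of such a loop, for illustration only:

void scale(double *a, int n, double k) {
  if (n > 0) {        // guard branch: one successor enters the loop's
    int i = 0;        // preheader, the other is the non-loop block
    do {              // rotated, do-while style body
      a[i] *= k;
      ++i;
    } while (i < n);  // latch test at the bottom
  }
}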
- if (DT->dominates(RHS.Preheader, LHS.Preheader)) { + if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) { // RHS dominates LHS // Verify LHS post-dominates RHS - assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader)); + assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock)); return false; } - if (DT->dominates(LHS.Preheader, RHS.Preheader)) { + if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) { // Verify RHS Postdominates LHS - assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader)); + assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock)); return true; } @@ -286,7 +384,6 @@ struct FusionCandidateCompare { } }; -namespace { using LoopVector = SmallVector<Loop *, 4>; // Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance @@ -301,17 +398,26 @@ using LoopVector = SmallVector<Loop *, 4>; // keeps the FusionCandidateSet sorted will also simplify the implementation. using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>; using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>; -} // namespace -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, +#if !defined(NDEBUG) +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const FusionCandidate &FC) { + if (FC.isValid()) + OS << FC.Preheader->getName(); + else + OS << "<Invalid>"; + + return OS; +} + +static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FusionCandidateSet &CandSet) { - for (auto IT : CandSet) - OS << IT << "\n"; + for (const FusionCandidate &FC : CandSet) + OS << FC << '\n'; return OS; } -#if !defined(NDEBUG) static void printFusionCandidates(const FusionCandidateCollection &FusionCandidates) { dbgs() << "Fusion Candidates: \n"; @@ -391,16 +497,6 @@ static void printLoopVector(const LoopVector &LV) { } #endif -static void reportLoopFusion(const FusionCandidate &FC0, - const FusionCandidate &FC1, - OptimizationRemarkEmitter &ORE) { - using namespace ore; - ORE.emit( - OptimizationRemark(DEBUG_TYPE, "LoopFusion", FC0.Preheader->getParent()) - << "Fused " << NV("Cand1", StringRef(FC0.Preheader->getName())) - << " with " << NV("Cand2", StringRef(FC1.Preheader->getName()))); -} - struct LoopFuser { private: // Sets of control flow equivalent fusion candidates for a given nest level. @@ -497,53 +593,8 @@ private: const FusionCandidate &FC1) const { assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders"); - if (DT.dominates(FC0.Preheader, FC1.Preheader)) - return PDT.dominates(FC1.Preheader, FC0.Preheader); - - if (DT.dominates(FC1.Preheader, FC0.Preheader)) - return PDT.dominates(FC0.Preheader, FC1.Preheader); - - return false; - } - - /// Determine if a fusion candidate (representing a loop) is eligible for - /// fusion. Note that this only checks whether a single loop can be fused - it - /// does not check whether it is *legal* to fuse two loops together. - bool eligibleForFusion(const FusionCandidate &FC) const { - if (!FC.isValid()) { - LLVM_DEBUG(dbgs() << "FC " << FC << " has invalid CFG requirements!\n"); - if (!FC.Preheader) - InvalidPreheader++; - if (!FC.Header) - InvalidHeader++; - if (!FC.ExitingBlock) - InvalidExitingBlock++; - if (!FC.ExitBlock) - InvalidExitBlock++; - if (!FC.Latch) - InvalidLatch++; - if (FC.L->isInvalid()) - InvalidLoop++; - - return false; - } - - // Require ScalarEvolution to be able to determine a trip count. 
- if (!SE.hasLoopInvariantBackedgeTakenCount(FC.L)) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " trip count not computable!\n"); - InvalidTripCount++; - return false; - } - - if (!FC.L->isLoopSimplifyForm()) { - LLVM_DEBUG(dbgs() << "Loop " << FC.L->getName() - << " is not in simplified form!\n"); - NotSimplifiedForm++; - return false; - } - - return true; + return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(), + DT, PDT); } /// Iterate over all loops in the given loop set and identify the loops that @@ -551,8 +602,8 @@ private: /// Flow Equivalent sets, sorted by dominance. void collectFusionCandidates(const LoopVector &LV) { for (Loop *L : LV) { - FusionCandidate CurrCand(L, &DT, &PDT); - if (!eligibleForFusion(CurrCand)) + FusionCandidate CurrCand(L, &DT, &PDT, ORE); + if (!CurrCand.isEligibleForFusion(SE)) continue; // Go through each list in FusionCandidates and determine if L is control @@ -664,31 +715,64 @@ private: if (!identicalTripCounts(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip " "counts. Not fusing.\n"); - NonEqualTripCount++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEqualTripCount); continue; } if (!isAdjacent(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidates are not adjacent. Not fusing.\n"); - NonAdjacent++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent); + continue; + } + + // Ensure that FC0 and FC1 have identical guards. + // If one (or both) are not guarded, this check is not necessary. + if (FC0->GuardBranch && FC1->GuardBranch && + !haveIdenticalGuards(*FC0, *FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical " + "guards. Not Fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonIdenticalGuards); continue; } - // For now we skip fusing if the second candidate has any instructions - // in the preheader. This is done because we currently do not have the - // safety checks to determine if it is save to move the preheader of - // the second candidate past the body of the first candidate. Once - // these checks are added, this condition can be removed. + // The following three checks look for empty blocks in FC0 and FC1. If + // any of these blocks are non-empty, we do not fuse. This is done + // because we currently do not have the safety checks to determine if + // it is safe to move the blocks past other blocks in the loop. Once + // these checks are added, these conditions can be relaxed. if (!isEmptyPreheader(*FC1)) { LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty " "preheader. Not fusing.\n"); - NonEmptyPreheader++; + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyPreheader); + continue; + } + + if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyExitBlock); + continue; + } + + if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) { + LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard " + "block. Not fusing.\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + NonEmptyGuardBlock); continue; } + // Check the dependencies across the loops and do not fuse if it would + // violate them. 
if (!dependencesAllowFusion(*FC0, *FC1)) { LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n"); + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + InvalidDependencies); continue; } @@ -696,9 +780,11 @@ private: LLVM_DEBUG(dbgs() << "\tFusion appears to be " << (BeneficialToFuse ? "" : "un") << "profitable!\n"); - if (!BeneficialToFuse) + if (!BeneficialToFuse) { + reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, + FusionNotBeneficial); continue; - + } // All analysis has completed and has determined that fusion is legal // and profitable. At this point, start transforming the code and // perform fusion. @@ -710,15 +796,14 @@ private: // Note this needs to be done *before* performFusion because // performFusion will change the original loops, making it not // possible to identify them after fusion is complete. - reportLoopFusion(*FC0, *FC1, ORE); + reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter); - FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT); + FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE); FusedCand.verify(); - assert(eligibleForFusion(FusedCand) && + assert(FusedCand.isEligibleForFusion(SE) && "Fused candidate should be eligible for fusion!"); // Notify the loop-depth-tree that these loops are not valid objects - // anymore. LDT.removeLoop(FC1->L); CandidateSet.erase(FC0); @@ -889,7 +974,7 @@ private: LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1 << "\n"); assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth()); - assert(DT.dominates(FC0.Preheader, FC1.Preheader)); + assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock())); for (Instruction *WriteL0 : FC0.MemWrites) { for (Instruction *WriteL1 : FC1.MemWrites) @@ -939,18 +1024,112 @@ private: return true; } - /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this - /// case, there is no code in between the two fusion candidates, thus making - /// them adjacent. + /// Determine if two fusion candidates are adjacent in the CFG. + /// + /// This method will determine if there are additional basic blocks in the CFG + /// between the exit of \p FC0 and the entry of \p FC1. + /// If the two candidates are guarded loops, then it checks whether the + /// non-loop successor of the \p FC0 guard branch is the entry block of \p + /// FC1. If not, then the loops are not adjacent. If the two candidates are + /// not guarded loops, then it checks whether the exit block of \p FC0 is the + /// preheader of \p FC1. bool isAdjacent(const FusionCandidate &FC0, const FusionCandidate &FC1) const { - return FC0.ExitBlock == FC1.Preheader; + // If the successor of the guard branch is FC1, then the loops are adjacent + if (FC0.GuardBranch) + return FC0.getNonLoopBlock() == FC1.getEntryBlock(); + else + return FC0.ExitBlock == FC1.getEntryBlock(); + } + + /// Determine if two fusion candidates have identical guards + /// + /// This method will determine if two fusion candidates have the same guards. + /// The guards are considered the same if: + /// 1. The instructions to compute the condition used in the compare are + /// identical. + /// 2. The successors of the guard have the same flow into/around the loop. + /// If the compare instructions are identical, then the first successor of the + /// guard must go to the same place (either the preheader of the loop or the + /// NonLoopBlock). 
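Putting the adjacency and guard checks together: two guarded candidates qualify when the non-loop successor of the first guard is the entry block of the second, the guard conditions are identical, and the trip counts match. A hedged sketch of a pair that can satisfy these checks after rotation, provided the intervening preheader, exit, and guard blocks stay empty; names are illustrative:

void two_passes(double *a, double *b, int n) {
  for (int i = 0; i < n; ++i)   // first candidate, guarded by n > 0
    a[i] += 1.0;
  for (int i = 0; i < n; ++i)   // second candidate with the identical guard,
    b[i] *= a[i];               // adjacent to the first in the CFG
}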
In other words, the the first successor of both loops must + /// both go into the loop (i.e., the preheader) or go around the loop (i.e., + /// the NonLoopBlock). The same must be true for the second successor. + bool haveIdenticalGuards(const FusionCandidate &FC0, + const FusionCandidate &FC1) const { + assert(FC0.GuardBranch && FC1.GuardBranch && + "Expecting FC0 and FC1 to be guarded loops."); + + if (auto FC0CmpInst = + dyn_cast<Instruction>(FC0.GuardBranch->getCondition())) + if (auto FC1CmpInst = + dyn_cast<Instruction>(FC1.GuardBranch->getCondition())) + if (!FC0CmpInst->isIdenticalTo(FC1CmpInst)) + return false; + + // The compare instructions are identical. + // Now make sure the successor of the guards have the same flow into/around + // the loop + if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader) + return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader); + else + return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader); + } + + /// Check that the guard for \p FC *only* contains the cmp/branch for the + /// guard. + /// Once we are able to handle intervening code, any code in the guard block + /// for FC1 will need to be treated as intervening code and checked whether + /// it can safely move around the loops. + bool isEmptyGuardBlock(const FusionCandidate &FC) const { + assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch."); + if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) { + auto *GuardBlock = FC.GuardBranch->getParent(); + // If the generation of the cmp value is in GuardBlock, then the size of + // the guard block should be 2 (cmp + branch). If the generation of the + // cmp value is in a different block, then the size of the guard block + // should only be 1. + if (CmpInst->getParent() == GuardBlock) + return GuardBlock->size() == 2; + else + return GuardBlock->size() == 1; + } + + return false; } bool isEmptyPreheader(const FusionCandidate &FC) const { + assert(FC.Preheader && "Expecting a valid preheader"); return FC.Preheader->size() == 1; } + bool isEmptyExitBlock(const FusionCandidate &FC) const { + assert(FC.ExitBlock && "Expecting a valid exit block"); + return FC.ExitBlock->size() == 1; + } + + /// Simplify the condition of the latch branch of \p FC to true, when both of + /// its successors are the same. + void simplifyLatchBranch(const FusionCandidate &FC) const { + BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator()); + if (FCLatchBranch) { + assert(FCLatchBranch->isConditional() && + FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) && + "Expecting the two successors of FCLatchBranch to be the same"); + FCLatchBranch->setCondition( + llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType())); + } + } + + /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique + /// successor, then merge FC0.Latch with its unique successor. + void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) { + moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI); + if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) { + MergeBlockIntoPredecessor(Succ, &DTU, &LI); + DTU.flush(); + } + } + /// Fuse two fusion candidates, creating a new fused loop. 
/// /// This method contains the mechanics of fusing two loops, represented by \p @@ -987,6 +1166,12 @@ private: LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump(); dbgs() << "Fusion Candidate 1: \n"; FC1.dump();); + // Fusing guarded loops is handled slightly differently than non-guarded + // loops and has been broken out into a separate method instead of trying to + // intersperse the logic within a single method. + if (FC0.GuardBranch) + return fuseGuardedLoops(FC0, FC1); + assert(FC1.Preheader == FC0.ExitBlock); assert(FC1.Preheader->size() == 1 && FC1.Preheader->getSingleSuccessor() == FC1.Header); @@ -1078,6 +1263,10 @@ private: FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + // If FC0.Latch and FC0.ExitingBlock are the same then we have already // performed the updates above. if (FC0.Latch != FC0.ExitingBlock) @@ -1100,9 +1289,15 @@ private: // Is there a way to keep SE up-to-date so we don't need to forget the loops // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), FC1.L->block_end()); @@ -1131,7 +1326,268 @@ private: SE.verify(); #endif - FuseCounter++; + LLVM_DEBUG(dbgs() << "Fusion done:\n"); + + return FC0.L; + } + + /// Report details on loop fusion opportunities. + /// + /// This template function can be used to report both successful and missed + /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should + /// be one of: + /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful + /// given two valid fusion candidates. + /// - OptimizationRemark to report successful fusion of two fusion + /// candidates. + /// The remarks will be printed using the form: + /// <path/filename>:<line number>:<column number>: [<function name>]: + /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description> + template <typename RemarkKind> + void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1, + llvm::Statistic &Stat) { + assert(FC0.Preheader && FC1.Preheader && + "Expecting valid fusion candidates"); + using namespace ore; + ++Stat; + ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(), + FC0.Preheader) + << "[" << FC0.Preheader->getParent()->getName() + << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName())) + << " and " << NV("Cand2", StringRef(FC1.Preheader->getName())) + << ": " << Stat.getDesc()); + } + + /// Fuse two guarded fusion candidates, creating a new fused loop. + /// + /// Fusing guarded loops is handled much the same way as fusing non-guarded + /// loops. The rewiring of the CFG is slightly different though, because of + /// the presence of the guards around the loops and the exit blocks after the + /// loop body. As such, the new loop is rewired as follows: + /// 1. Keep the guard branch from FC0 and use the non-loop block target + /// from the FC1 guard branch. + /// 2. Remove the exit block from FC0 (this exit block should be empty + /// right now). + /// 3. 
Remove the guard branch for FC1 + /// 4. Remove the preheader for FC1. + /// The exit block successor for the latch of FC0 is updated to be the header + /// of FC1 and the non-exit block successor of the latch of FC1 is updated to + /// be the header of FC0, thus creating the fused loop. + Loop *fuseGuardedLoops(const FusionCandidate &FC0, + const FusionCandidate &FC1) { + assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops"); + + BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent(); + BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent(); + BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock(); + BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock(); + + assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent"); + + SmallVector<DominatorTree::UpdateType, 8> TreeUpdates; + + //////////////////////////////////////////////////////////////////////////// + // Update the Loop Guard + //////////////////////////////////////////////////////////////////////////// + // The guard for FC0 is updated to guard both FC0 and FC1. This is done by + // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1. + // Thus, one path from the guard goes to the preheader for FC0 (and thus + // executes the new fused loop) and the other path goes to the NonLoopBlock + // for FC1 (where FC1 guard would have gone if FC1 was not executed). + FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock); + FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock, + FC1.Header); + + // The guard of FC1 is not necessary anymore. + FC1.GuardBranch->eraseFromParent(); + new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock); + + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1.Preheader)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock)); + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock)); + + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "Expecting guard block to have no predecessors"); + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "Expecting guard block to have no successors"); + + // Remember the phi nodes originally in the header of FC0 in order to rewire + // them later. However, this is only necessary if the new loop carried + // values might not dominate the exiting branch. While we do not generally + // test if this is the case but simply insert intermediate phi nodes, we + // need to make sure these intermediate phi nodes have different + // predecessors. To this end, we filter the special case where the exiting + // block is the latch block of the first loop. Nothing needs to be done + // anyway as all loop carried values dominate the latch and thereby also the + // exiting branch. + // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch + // (because the loops are rotated. Thus, nothing will ever be added to + // OriginalFC0PHIs. + SmallVector<PHINode *, 8> OriginalFC0PHIs; + if (FC0.ExitingBlock != FC0.Latch) + for (PHINode &PHI : FC0.Header->phis()) + OriginalFC0PHIs.push_back(&PHI); + + assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!"); + + // Replace incoming blocks for header PHIs first. 
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+ // the new values do only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+ // second will exit too, __without__ taking the back-edge (their
+ // trip-counts are equal after all).
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+ // Remove FC0 Exit Block
+ // The exit block for FC0 is no longer needed since control will flow
+ // directly to the header of FC1. Since it is an empty block, it can be
+ // removed at this point.
+ // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
+ assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+ "Expecting exit block to be empty");
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
+ assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+ // Moves the phi nodes from the second to the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front(); + for (PHINode *LCPHI : OriginalFC0PHIs) { + int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch); + assert(L1LatchBBIdx >= 0 && + "Expected loop carried value to be rewired at this point!"); + + Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx); + + PHINode *L1HeaderPHI = PHINode::Create( + LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP); + L1HeaderPHI->addIncoming(LCV, FC0.Latch); + L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()), + FC0.ExitingBlock); + + LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI); + } + + // Update the latches + + // Replace latch terminator destinations. + FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header); + FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header); + + // Change the condition of FC0 latch branch to true, as both successors of + // the branch are the same. + simplifyLatchBranch(FC0); + + // If FC0.Latch and FC0.ExitingBlock are the same then we have already + // performed the updates above. + if (FC0.Latch != FC0.ExitingBlock) + TreeUpdates.emplace_back(DominatorTree::UpdateType( + DominatorTree::Insert, FC0.Latch, FC1.Header)); + + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC0.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert, + FC1.Latch, FC0.Header)); + TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete, + FC1.Latch, FC1.Header)); + + // All done + // Apply the updates to the Dominator Tree and cleanup. + + assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) && + "FC1GuardBlock has successors!!"); + assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) && + "FC1GuardBlock has predecessors!!"); + + // Update DT/PDT + DTU.applyUpdates(TreeUpdates); + + LI.removeBlock(FC1.Preheader); + DTU.deleteBB(FC1.Preheader); + DTU.deleteBB(FC0.ExitBlock); + DTU.flush(); + + // Is there a way to keep SE up-to-date so we don't need to forget the loops + // and rebuild the information in subsequent passes of fusion? + // Note: Need to forget the loops before merging the loop latches, as + // mergeLatch may remove the only block in FC1. + SE.forgetLoop(FC1.L); + SE.forgetLoop(FC0.L); + + // Move instructions from FC0.Latch to FC1.Latch. + // Note: mergeLatch requires an updated DT. + mergeLatch(FC0, FC1); + + // Merge the loops. + SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(), + FC1.L->block_end()); + for (BasicBlock *BB : Blocks) { + FC0.L->addBlockEntry(BB); + FC1.L->removeBlockFromLoop(BB); + if (LI.getLoopFor(BB) != FC1.L) + continue; + LI.changeLoopFor(BB, FC0.L); + } + while (!FC1.L->empty()) { + const auto &ChildLoopIt = FC1.L->begin(); + Loop *ChildLoop = *ChildLoopIt; + FC1.L->removeChildLoop(ChildLoopIt); + FC0.L->addChildLoop(ChildLoop); + } + + // Delete the now empty loop L1. 
+ LI.erase(FC1.L); + +#ifndef NDEBUG + assert(!verifyFunction(*FC0.Header->getParent(), &errs())); + assert(DT.verify(DominatorTree::VerificationLevel::Fast)); + assert(PDT.verify()); + LI.verify(DT); + SE.verify(); +#endif LLVM_DEBUG(dbgs() << "Fusion done:\n"); @@ -1177,6 +1633,7 @@ struct LoopFuseLegacy : public FunctionPass { return LF.fuseLoops(F); } }; +} // namespace PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &LI = AM.getResult<LoopAnalysis>(F); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index e561494f19cf..b77843d7cd71 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -81,6 +81,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -171,7 +172,7 @@ private: bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, - unsigned StoreAlignment, Value *StoredVal, + MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, @@ -217,7 +218,8 @@ public: LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI( *L->getHeader()->getParent()); @@ -729,7 +731,8 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL, bool NegStride = StoreSize == -Stride; - if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(), + if (processLoopStridedStore(StorePtr, StoreSize, + MaybeAlign(HeadStore->getAlignment()), StoredVal, HeadStore, AdjacentStores, StoreEv, BECount, NegStride)) { TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end()); @@ -784,9 +787,9 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, SmallPtrSet<Instruction *, 1> MSIs; MSIs.insert(MSI); bool NegStride = SizeInBytes == -Stride; - return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, - MSI->getDestAlignment(), SplatValue, MSI, MSIs, - Ev, BECount, NegStride, /*IsLoopMemset=*/true); + return processLoopStridedStore( + Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()), + SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true); } /// mayLoopAccessLocation - Return true if the specified loop might access the @@ -876,7 +879,7 @@ static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr, /// processLoopStridedStore - We see a strided store of some value. If we can /// transform this into a memset or memset_pattern in the loop preheader, do so. 
bool LoopIdiomRecognize::processLoopStridedStore( - Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment, + Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment, Value *StoredVal, Instruction *TheStore, SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev, const SCEV *BECount, bool NegStride, bool IsLoopMemset) { @@ -898,12 +901,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEVExpander Expander(*SE, *DL, "loop-idiom"); Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS); - Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS); + Type *IntIdxTy = DL->getIndexType(DestPtr->getType()); const SCEV *Start = Ev->getStart(); // Handle negative strided loops. if (NegStride) - Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); + Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -931,7 +934,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( // Okay, everything looks good, insert the memset. const SCEV *NumBytesS = - getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); // TODO: ideally we should still be able to generate memset if SCEV expander // is taught to generate the dependencies at the latest point. @@ -939,12 +942,12 @@ bool LoopIdiomRecognize::processLoopStridedStore( return false; Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall; if (SplatValue) { - NewCall = - Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment); + NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, + MaybeAlign(StoreAlignment)); } else { // Everything is emitted in default address space Type *Int8PtrTy = DestInt8PtrTy; @@ -952,7 +955,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Module *M = TheStore->getModule(); StringRef FuncName = "memset_pattern16"; FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr); + Int8PtrTy, Int8PtrTy, IntIdxTy); inferLibFuncAttributes(M, FuncName, *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -961,7 +964,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern"); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these. - GV->setAlignment(16); + GV->setAlignment(Align(16)); Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy); NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes}); } @@ -1019,11 +1022,11 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *StrStart = StoreEv->getStart(); unsigned StrAS = SI->getPointerAddressSpace(); - Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS); + Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS)); // Handle negative strided loops. if (NegStride) - StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE); + StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE); // Okay, we have a strided store "p[i]" of a loaded value. We can turn // this into a memcpy in the loop preheader now if we want. However, this @@ -1049,7 +1052,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Handle negative strided loops. 
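The alignment-related edits in this hunk follow the Align/MaybeAlign migration: a raw unsigned alignment of 0 becomes an unset MaybeAlign, any other power of two wraps an Align, and IRBuilder::CreateMemSet accepts the wrapper directly. A small sketch under those assumptions (emitMemSet and RawAlign are illustrative names, not part of the patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"

using namespace llvm;

// Emit a memset whose destination alignment may be unknown (RawAlign == 0).
static CallInst *emitMemSet(IRBuilder<> &Builder, Value *Dst, Value *Byte,
                            Value *NumBytes, unsigned RawAlign) {
  // MaybeAlign(0) means "no alignment known"; nonzero wraps Align(RawAlign).
  MaybeAlign DstAlign(RawAlign);
  return Builder.CreateMemSet(Dst, Byte, NumBytes, DstAlign);
}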
if (NegStride) - LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE); + LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE); // For a memcpy, we have to make sure that the input array is not being // mutated by the loop. @@ -1071,18 +1074,18 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // Okay, everything is safe, we can transform this! const SCEV *NumBytesS = - getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE); + getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE); Value *NumBytes = - Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); + Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator()); CallInst *NewCall = nullptr; // Check whether to generate an unordered atomic memcpy: // If the load or store are atomic, then they must necessarily be unordered // by previous checks. if (!SI->isAtomic() && !LI->isAtomic()) - NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(), - LoadBasePtr, LI->getAlignment(), NumBytes); + NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr, + LI->getAlign(), NumBytes); else { // We cannot allow unaligned ops for unordered load/store, so reject // anything where the alignment isn't at least the element size. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 31191b52895c..901204181a7c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/User.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Scalar.h" @@ -192,7 +193,8 @@ public: getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); MemorySSA *MSSA = nullptr; Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { @@ -225,7 +227,8 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, Optional<MemorySSAUpdater> MSSAU; if (AR.MSSA) { MSSAU = MemorySSAUpdater(AR.MSSA); - AR.MSSA->verifyMemorySSA(); + if (VerifyMemorySSA) + AR.MSSA->verifyMemorySSA(); } if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI, MSSAU.hasValue() ? 
MSSAU.getPointer() : nullptr)) @@ -233,7 +236,7 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, auto PA = getLoopPassPreservedAnalyses(); PA.preserveSet<CFGAnalyses>(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 9a42365adc1b..6ce2d06058cf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -33,6 +33,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -410,8 +411,6 @@ public: void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); private: - void splitInnerLoopLatch(Instruction *); - void splitInnerLoopHeader(); bool adjustLoopLinks(); void adjustLoopPreheaders(); bool adjustLoopBranches(); @@ -718,22 +717,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( return true; } -static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (PHINode &PHI : Block->phis()) { - // Reduction lcssa phi will have only 1 incoming block that from loop latch. - if (PHI.getNumIncomingValues() > 1) - return false; - Instruction *Ins = dyn_cast<Instruction>(PHI.getIncomingValue(0)); - if (!Ins) - return false; - // Incoming value for lcssa phi's in outer loop exit can only be inner loop - // exits lcssa phi else it would not be tightly nested. - if (!isa<PHINode>(Ins) && isOuterLoopExitBlock) - return false; - } - return true; -} - // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { @@ -832,21 +815,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: We only handle LCSSA PHI's corresponding to reduction for now. - BasicBlock *InnerExit = InnerLoop->getExitBlock(); - if (!containsSafePHI(InnerExit, false)) { - LLVM_DEBUG( - dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - // TODO: Current limitation: Since we split the inner loop latch at the point // were induction variable is incremented (induction.next); We cannot have // more than 1 user of induction.next since it would result in broken code @@ -922,6 +890,28 @@ bool LoopInterchangeLegality::currentLimitations() { return false; } +// We currently only support LCSSA PHI nodes in the inner loop exit, if their +// users are either reduction PHIs or PHIs outside the outer loop (which means +// the we are only interested in the final value after the loop). +static bool +areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, + SmallPtrSetImpl<PHINode *> &Reductions) { + BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); + for (PHINode &PHI : InnerExit->phis()) { + // Reduction lcssa phi will have only 1 incoming block that from loop latch. 
+ if (PHI.getNumIncomingValues() > 1) + return false; + if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { + PHINode *PN = dyn_cast<PHINode>(U); + return !PN || (Reductions.find(PN) == Reductions.end() && + OuterL->contains(PN->getParent())); + })) { + return false; + } + } + return true; +} + // We currently support LCSSA PHI nodes in the outer loop exit, if their // incoming values do not come from the outer loop latch or if the // outer loop latch has a single predecessor. In that case, the value will @@ -929,7 +919,7 @@ bool LoopInterchangeLegality::currentLimitations() { // will still be true after interchanging. If we have multiple predecessor, // that may not be the case, e.g. because the outer loop latch may be executed // if the inner loop is not executed. -static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { // FIXME: We currently are not able to detect floating point reductions @@ -1014,7 +1004,19 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } - if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, + OuterInnerReductions)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + + if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", @@ -1226,7 +1228,7 @@ bool LoopInterchangeTransform::transform() { if (InnerLoop->getSubLoops().empty()) { BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader(); - LLVM_DEBUG(dbgs() << "Calling Split Inner Loop\n"); + LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n"); PHINode *InductionPHI = getInductionVariable(InnerLoop, SE); if (!InductionPHI) { LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n"); @@ -1242,11 +1244,55 @@ bool LoopInterchangeTransform::transform() { if (&InductionPHI->getParent()->front() != InductionPHI) InductionPHI->moveBefore(&InductionPHI->getParent()->front()); - // Split at the place were the induction variable is - // incremented/decremented. - // TODO: This splitting logic may not work always. Fix this. - splitInnerLoopLatch(InnerIndexVar); - LLVM_DEBUG(dbgs() << "splitInnerLoopLatch done\n"); + // Create a new latch block for the inner loop. We split at the + // current latch's terminator and then move the condition and all + // operands that are not either loop-invariant or the induction PHI into the + // new latch block. + BasicBlock *NewLatch = + SplitBlock(InnerLoop->getLoopLatch(), + InnerLoop->getLoopLatch()->getTerminator(), DT, LI); + + SmallSetVector<Instruction *, 4> WorkList; + unsigned i = 0; + auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() { + for (; i < WorkList.size(); i++) { + // Duplicate instruction and move it the new latch. Update uses that + // have been moved. 
+ Instruction *NewI = WorkList[i]->clone(); + NewI->insertBefore(NewLatch->getFirstNonPHI()); + assert(!NewI->mayHaveSideEffects() && + "Moving instructions with side-effects may change behavior of " + "the loop nest!"); + for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end(); + UI != UE;) { + Use &U = *UI++; + Instruction *UserI = cast<Instruction>(U.getUser()); + if (!InnerLoop->contains(UserI->getParent()) || + UserI->getParent() == NewLatch || UserI == InductionPHI) + U.set(NewI); + } + // Add operands of moved instruction to the worklist, except if they are + // outside the inner loop or are the induction PHI. + for (Value *Op : WorkList[i]->operands()) { + Instruction *OpI = dyn_cast<Instruction>(Op); + if (!OpI || + this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop || + OpI == InductionPHI) + continue; + WorkList.insert(OpI); + } + } + }; + + // FIXME: Should we interchange when we have a constant condition? + Instruction *CondI = dyn_cast<Instruction>( + cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator()) + ->getCondition()); + if (CondI) + WorkList.insert(CondI); + MoveInstructions(); + WorkList.insert(cast<Instruction>(InnerIndexVar)); + MoveInstructions(); // Splits the inner loops phi nodes out into a separate basic block. BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); @@ -1263,10 +1309,6 @@ bool LoopInterchangeTransform::transform() { return true; } -void LoopInterchangeTransform::splitInnerLoopLatch(Instruction *Inc) { - SplitBlock(InnerLoop->getLoopLatch(), Inc, DT, LI); -} - /// \brief Move all instructions except the terminator from FromBB right before /// InsertBefore static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { @@ -1277,31 +1319,39 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { FromBB->getTerminator()->getIterator()); } -/// Update BI to jump to NewBB instead of OldBB. Records updates to -/// the dominator tree in DTUpdates, if DT should be preserved. +// Update BI to jump to NewBB instead of OldBB. Records updates to the +// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that +// \p OldBB is exactly once in BI's successor list. static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, - std::vector<DominatorTree::UpdateType> &DTUpdates) { - assert(llvm::count_if(successors(BI), - [OldBB](BasicBlock *BB) { return BB == OldBB; }) < 2 && - "BI must jump to OldBB at most once."); - for (unsigned i = 0, e = BI->getNumSuccessors(); i < e; ++i) { - if (BI->getSuccessor(i) == OldBB) { - BI->setSuccessor(i, NewBB); - - DTUpdates.push_back( - {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); - DTUpdates.push_back( - {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); - break; + std::vector<DominatorTree::UpdateType> &DTUpdates, + bool MustUpdateOnce = true) { + assert((!MustUpdateOnce || + llvm::count_if(successors(BI), + [OldBB](BasicBlock *BB) { + return BB == OldBB; + }) == 1) && "BI must jump to OldBB exactly once."); + bool Changed = false; + for (Use &Op : BI->operands()) + if (Op == OldBB) { + Op.set(NewBB); + Changed = true; } + + if (Changed) { + DTUpdates.push_back( + {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); + DTUpdates.push_back( + {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); } + assert(Changed && "Expected a successor to be updated"); } // Move Lcssa PHIs to the right place. 
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, - BasicBlock *OuterLatch, BasicBlock *OuterExit) { + BasicBlock *OuterLatch, BasicBlock *OuterExit, + Loop *InnerLoop, LoopInfo *LI) { // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are // defined either in the header or latch. Those blocks will become header and @@ -1356,19 +1406,17 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, P->moveBefore(InnerExit->getFirstNonPHI()); // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have - // incoming values from the outer latch or header, we have to add a new PHI + // incoming values defined in the outer loop, we have to add a new PHI // in the inner loop latch, which became the exit block of the outer loop, // after interchanging. if (OuterExit) { for (PHINode &P : OuterExit->phis()) { if (P.getNumIncomingValues() != 1) continue; - // Skip Phis with incoming values not defined in the outer loop's header - // and latch. Also skip incoming phis defined in the latch. Those should + // Skip Phis with incoming values defined in the inner loop. Those should // already have been updated. auto I = dyn_cast<Instruction>(P.getIncomingValue(0)); - if (!I || ((I->getParent() != OuterLatch || isa<PHINode>(I)) && - I->getParent() != OuterHeader)) + if (!I || LI->getLoopFor(I->getParent()) == InnerLoop) continue; PHINode *NewPhi = dyn_cast<PHINode>(P.clone()); @@ -1443,12 +1491,21 @@ bool LoopInterchangeTransform::adjustLoopBranches() { if (!InnerLoopHeaderSuccessor) return false; - // Adjust Loop Preheader and headers + // Adjust Loop Preheader and headers. + // The branches in the outer loop predecessor and the outer loop header can + // be unconditional branches or conditional branches with duplicates. Consider + // this when updating the successors. updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader, - InnerLoopPreHeader, DTUpdates); - updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates); + InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); + // The outer loop header might or might not branch to the outer latch. + // We are guaranteed to branch to the inner loop preheader. + if (std::find(succ_begin(OuterLoopHeaderBI), succ_end(OuterLoopHeaderBI), + OuterLoopLatch) != succ_end(OuterLoopHeaderBI)) + updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates, + /*MustUpdateOnce=*/false); updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader, - InnerLoopHeaderSuccessor, DTUpdates); + InnerLoopHeaderSuccessor, DTUpdates, + /*MustUpdateOnce=*/false); // Adjust reduction PHI's now that the incoming block has changed. InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader, @@ -1482,7 +1539,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() { OuterLoopPreHeader); moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch, - OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock()); + OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(), + InnerLoop, LI); // For PHIs in the exit block of the outer loop, outer's latch has been // replaced by Inners'. 
OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 2b3d5e0ce9b7..4e1b4e87ebc9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -49,6 +49,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -435,7 +436,8 @@ public: PH->getTerminator()); Value *Initial = new LoadInst( Cand.Load->getType(), InitialPtr, "load_initial", - /* isVolatile */ false, Cand.Load->getAlignment(), PH->getTerminator()); + /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()), + PH->getTerminator()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded", &L->getHeader()->front()); @@ -487,7 +489,7 @@ public: // Filter the candidates further. SmallVector<StoreToLoadForwardingCandidate, 4> Candidates; unsigned NumForwarding = 0; - for (const StoreToLoadForwardingCandidate Cand : StoreToLoadDependences) { + for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) { LLVM_DEBUG(dbgs() << "Candidate " << Cand); // Make sure that the stored values is available everywhere in the loop in @@ -543,7 +545,8 @@ public: auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 507a1e251ca6..1a42f6b23443 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -191,9 +191,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/GuardUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -248,7 +251,9 @@ struct LoopICmp { class LoopPredication { AliasAnalysis *AA; + DominatorTree *DT; ScalarEvolution *SE; + LoopInfo *LI; BranchProbabilityInfo *BPI; Loop *L; @@ -300,10 +305,13 @@ class LoopPredication { // within the loop. We identify such unprofitable loops through BPI. 
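The new LoopPredication code further down is phrased in terms of widenable branches: a conditional branch whose condition is the AND of an actual check and a call to the llvm.experimental.widenable.condition intrinsic, which later transforms are allowed to strengthen. A rough sketch of building such a branch with IRBuilder (block and value names are illustrative assumptions):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Build: br i1 (and RealCond, widenable_condition()), GuardedBB, DeoptBB
static BranchInst *emitWidenableBranch(IRBuilder<> &Builder, Module *M,
                                       Value *RealCond, BasicBlock *GuardedBB,
                                       BasicBlock *DeoptBB) {
  Function *WC = Intrinsic::getDeclaration(
      M, Intrinsic::experimental_widenable_condition);
  Value *WCCall = Builder.CreateCall(WC, {}, "widenable_cond");
  Value *Cond = Builder.CreateAnd(RealCond, WCCall, "guard_cond");
  return Builder.CreateCondBr(Cond, GuardedBB, DeoptBB);
}

Because spuriously taking the false (deoptimizing) successor is always allowed, a pass that ANDs extra conditions into the branch only has to keep the deopt path sound, which is what the predicateLoopExits transform below relies on.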
bool isLoopProfitableToPredicate(); + bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter); + public: - LoopPredication(AliasAnalysis *AA, ScalarEvolution *SE, + LoopPredication(AliasAnalysis *AA, DominatorTree *DT, + ScalarEvolution *SE, LoopInfo *LI, BranchProbabilityInfo *BPI) - : AA(AA), SE(SE), BPI(BPI){}; + : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {}; bool runOnLoop(Loop *L); }; @@ -323,10 +331,12 @@ public: if (skipLoop(L)) return false; auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI(); auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - LoopPredication LP(AA, SE, &BPI); + LoopPredication LP(AA, DT, SE, LI, &BPI); return LP.runOnLoop(L); } }; @@ -352,7 +362,7 @@ PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F); - LoopPredication LP(&AR.AA, &AR.SE, BPI); + LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, BPI); if (!LP.runOnLoop(&L)) return PreservedAnalyses::all(); @@ -543,7 +553,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) { if (const auto *LI = dyn_cast<LoadInst>(U->getValue())) if (LI->isUnordered() && L->hasLoopInvariantOperands(LI)) if (AA->pointsToConstantMemory(LI->getOperand(0)) || - LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr) + LI->hasMetadata(LLVMContext::MD_invariant_load)) return true; return false; } @@ -823,9 +833,9 @@ bool LoopPredication::widenWidenableBranchGuardConditions( Value *AllChecks = Builder.CreateAnd(Checks); auto *OldCond = BI->getCondition(); BI->setCondition(AllChecks); + RecursivelyDeleteTriviallyDeadInstructions(OldCond); assert(isGuardAsWidenableBranch(BI) && "Stopped being a guard after transform?"); - RecursivelyDeleteTriviallyDeadInstructions(OldCond); LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); return true; @@ -953,6 +963,233 @@ bool LoopPredication::isLoopProfitableToPredicate() { return true; } +/// If we can (cheaply) find a widenable branch which controls entry into the +/// loop, return it. +static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) { + // Walk back through any unconditional executed blocks and see if we can find + // a widenable condition which seems to control execution of this loop. Note + // that we predict that maythrow calls are likely untaken and thus that it's + // profitable to widen a branch before a maythrow call with a condition + // afterwards even though that may cause the slow path to run in a case where + // it wouldn't have otherwise. + BasicBlock *BB = L->getLoopPreheader(); + if (!BB) + return nullptr; + do { + if (BasicBlock *Pred = BB->getSinglePredecessor()) + if (BB == Pred->getSingleSuccessor()) { + BB = Pred; + continue; + } + break; + } while (true); + + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + auto *Term = Pred->getTerminator(); + + Value *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) && + IfTrueBB == BB) + return cast<BranchInst>(Term); + } + return nullptr; +} + +/// Return the minimum of all analyzeable exit counts. This is an upper bound +/// on the actual exit count. 
If there are not at least two analyzeable exits, +/// returns SCEVCouldNotCompute. +static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE, + DominatorTree &DT, + Loop *L) { + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + SmallVector<const SCEV *, 4> ExitCounts; + for (BasicBlock *ExitingBB : ExitingBlocks) { + const SCEV *ExitCount = SE.getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount)) + continue; + assert(DT.dominates(ExitingBB, L->getLoopLatch()) && + "We should only have known counts for exiting blocks that " + "dominate latch!"); + ExitCounts.push_back(ExitCount); + } + if (ExitCounts.size() < 2) + return SE.getCouldNotCompute(); + return SE.getUMinFromMismatchedTypes(ExitCounts); +} + +/// Return true if we can be fairly sure that executing block BB will probably +/// lead to executing an __llvm_deoptimize. This is a profitability heuristic, +/// not a legality constraint. +static bool isVeryLikelyToDeopt(BasicBlock *BB) { + while (BB->getUniqueSuccessor()) + // Will skip side effects, that's okay + BB = BB->getUniqueSuccessor(); + + return BB->getTerminatingDeoptimizeCall(); +} + +/// This implements an analogous, but entirely distinct transform from the main +/// loop predication transform. This one is phrased in terms of using a +/// widenable branch *outside* the loop to allow us to simplify loop exits in a +/// following loop. This is close in spirit to the IndVarSimplify transform +/// of the same name, but is materially different widening loosens legality +/// sharply. +bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) { + // The transformation performed here aims to widen a widenable condition + // above the loop such that all analyzeable exit leading to deopt are dead. + // It assumes that the latch is the dominant exit for profitability and that + // exits branching to deoptimizing blocks are rarely taken. It relies on the + // semantics of widenable expressions for legality. (i.e. being able to fall + // down the widenable path spuriously allows us to ignore exit order, + // unanalyzeable exits, side effects, exceptional exits, and other challenges + // which restrict the applicability of the non-WC based version of this + // transform in IndVarSimplify.) + // + // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may + // imply flags on the expression being hoisted and inserting new uses (flags + // are only correct for current uses). The result is that we may be + // inserting a branch on the value which can be either poison or undef. In + // this case, the branch can legally go either way; we just need to avoid + // introducing UB. This is achieved through the use of the freeze + // instruction. + + SmallVector<BasicBlock *, 16> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); + + if (ExitingBlocks.empty()) + return false; // Nothing to do. + + auto *Latch = L->getLoopLatch(); + if (!Latch) + return false; + + auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI); + if (!WidenableBR) + return false; + + const SCEV *LatchEC = SE->getExitCount(L, Latch); + if (isa<SCEVCouldNotCompute>(LatchEC)) + return false; // profitability - want hot exit in analyzeable set + + // At this point, we have found an analyzeable latch, and a widenable + // condition above the loop. 
If we have a widenable exit within the loop + // (for which we can't compute exit counts), drop the ability to further + // widen so that we gain ability to analyze it's exit count and perform this + // transform. TODO: It'd be nice to know for sure the exit became + // analyzeable after dropping widenability. + { + bool Invalidate = false; + + for (auto *ExitingBB : ExitingBlocks) { + if (LI->getLoopFor(ExitingBB) != L) + continue; + + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + Use *Cond, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) && + L->contains(IfTrueBB)) { + WC->set(ConstantInt::getTrue(IfTrueBB->getContext())); + Invalidate = true; + } + } + if (Invalidate) + SE->forgetLoop(L); + } + + // The use of umin(all analyzeable exits) instead of latch is subtle, but + // important for profitability. We may have a loop which hasn't been fully + // canonicalized just yet. If the exit we chose to widen is provably never + // taken, we want the widened form to *also* be provably never taken. We + // can't guarantee this as a current unanalyzeable exit may later become + // analyzeable, but we can at least avoid the obvious cases. + const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L); + if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() || + !SE->isLoopInvariant(MinEC, L) || + !isSafeToExpandAt(MinEC, WidenableBR, *SE)) + return false; + + // Subtlety: We need to avoid inserting additional uses of the WC. We know + // that it can only have one transitive use at the moment, and thus moving + // that use to just before the branch and inserting code before it and then + // modifying the operand is legal. + auto *IP = cast<Instruction>(WidenableBR->getCondition()); + IP->moveBefore(WidenableBR); + Rewriter.setInsertPoint(IP); + IRBuilder<> B(IP); + + bool Changed = false; + Value *MinECV = nullptr; // lazily generated if needed + for (BasicBlock *ExitingBB : ExitingBlocks) { + // If our exiting block exits multiple loops, we can only rewrite the + // innermost one. Otherwise, we're changing how many times the innermost + // loop runs before it exits. + if (LI->getLoopFor(ExitingBB) != L) + continue; + + // Can't rewrite non-branch yet. + auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator()); + if (!BI) + continue; + + // If already constant, nothing to do. + if (isa<Constant>(BI->getCondition())) + continue; + + const SCEV *ExitCount = SE->getExitCount(L, ExitingBB); + if (isa<SCEVCouldNotCompute>(ExitCount) || + ExitCount->getType()->isPointerTy() || + !isSafeToExpandAt(ExitCount, WidenableBR, *SE)) + continue; + + const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB)); + BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 
0 : 1); + if (!isVeryLikelyToDeopt(ExitBB)) + // Profitability: indicator of rarely/never taken exit + continue; + + // If we found a widenable exit condition, do two things: + // 1) fold the widened exit test into the widenable condition + // 2) fold the branch to untaken - avoids infinite looping + + Value *ECV = Rewriter.expandCodeFor(ExitCount); + if (!MinECV) + MinECV = Rewriter.expandCodeFor(MinEC); + Value *RHS = MinECV; + if (ECV->getType() != RHS->getType()) { + Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType()); + ECV = B.CreateZExt(ECV, WiderTy); + RHS = B.CreateZExt(RHS, WiderTy); + } + assert(!Latch || DT->dominates(ExitingBB, Latch)); + Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS); + // Freeze poison or undef to an arbitrary bit pattern to ensure we can + // branch without introducing UB. See NOTE ON POISON/UNDEF above for + // context. + NewCond = B.CreateFreeze(NewCond); + + widenWidenableBranch(WidenableBR, NewCond); + + Value *OldCond = BI->getCondition(); + BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue)); + Changed = true; + } + + if (Changed) + // We just mutated a bunch of loop exits changing there exit counts + // widely. We need to force recomputation of the exit counts given these + // changes. Note that all of the inserted exits are never taken, and + // should be removed next time the CFG is modified. + SE->forgetLoop(L); + return Changed; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; @@ -1004,16 +1241,12 @@ bool LoopPredication::runOnLoop(Loop *Loop) { cast<BranchInst>(BB->getTerminator())); } - if (Guards.empty() && GuardsAsWidenableBranches.empty()) - return false; - SCEVExpander Expander(*SE, *DL, "loop-predication"); - bool Changed = false; for (auto *Guard : Guards) Changed |= widenGuardConditions(Guard, Expander); for (auto *Guard : GuardsAsWidenableBranches) Changed |= widenWidenableBranchGuardConditions(Guard, Expander); - + Changed |= predicateLoopExits(L, Expander); return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 166b57f20b43..da13a342ae12 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -45,6 +44,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -53,6 +53,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include <cassert> #include <cstddef> @@ -1644,7 +1645,8 @@ bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) { AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + 
*L->getHeader()->getParent()); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp index e009947690af..0868e742f4ee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" @@ -55,7 +57,7 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, AR.MSSA->verifyMemorySSA(); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } @@ -94,17 +96,15 @@ public: auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); - auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; - auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); - auto *SE = SEWP ? &SEWP->getSE() : nullptr; + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); Optional<MemorySSAUpdater> MSSAU; if (EnableMSSALoopDependency) { MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); MSSAU = MemorySSAUpdater(MSSA); } - return LoopRotation(L, LI, TTI, AC, DT, SE, + return LoopRotation(L, LI, TTI, AC, &DT, &SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false, MaxHeaderSize, false); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 046f4c8af492..b27e65e0adb7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -30,6 +30,8 @@ #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils.h" @@ -660,6 +662,9 @@ static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, // Merge Succ into Pred and delete it. 
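A bit further on, the LoopSink hunk swaps a hand-written use-list walk for Value::replaceUsesWithIf, which rewrites only the uses accepted by a predicate. A minimal sketch of that helper (replaceUsesInBlock and its parameter names are illustrative):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Redirect to NewV only those uses of OldV whose user sits in block BB.
static void replaceUsesInBlock(Value *OldV, Value *NewV, BasicBlock *BB) {
  OldV->replaceUsesWithIf(NewV, [BB](Use &U) {
    return cast<Instruction>(U.getUser())->getParent() == BB;
  });
}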
MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU); + if (MSSAU && VerifyMemorySSA) + MSSAU->getMemorySSA()->verifyMemorySSA(); + Changed = true; } @@ -690,7 +695,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &LPMU) { Optional<MemorySSAUpdater> MSSAU; - if (EnableMSSALoopDependency && AR.MSSA) + if (AR.MSSA) MSSAU = MemorySSAUpdater(AR.MSSA); bool DeleteCurrentLoop = false; if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE, @@ -702,7 +707,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LPMU.markLoopAsDeleted(L, "loop-simplifycfg"); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp index 975452e13f09..1c03a4bf6c02 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -41,14 +41,15 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -230,12 +231,9 @@ static bool sinkInstruction(Loop &L, Instruction &I, IC->setName(I.getName()); IC->insertBefore(&*N->getFirstInsertionPt()); // Replaces uses of I with IC in N - for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) { - Use &U = *UI++; - auto *I = cast<Instruction>(U.getUser()); - if (I->getParent() == N) - U.set(IC); - } + I.replaceUsesWithIf(IC, [N](Use &U) { + return cast<Instruction>(U.getUser())->getParent() == N; + }); // Replaces uses of I with IC in blocks dominated by N replaceDominatedUsesWith(&I, IC, DT, N); LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 59a387a186b8..e9f368628a08 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -74,7 +74,6 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -97,6 +96,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -108,6 +108,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -115,8 +116,8 @@ #include <cstdlib> #include <iterator> #include <limits> 
-#include <numeric> #include <map> +#include <numeric> #include <utility> using namespace llvm; @@ -1386,7 +1387,9 @@ void Cost::RateFormula(const Formula &F, // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as // additional instruction (at least fill). - unsigned TTIRegNum = TTI->getNumberOfRegisters(false) - 1; + // TODO: Need distinguish register class? + unsigned TTIRegNum = TTI->getNumberOfRegisters( + TTI->getRegisterClassForType(false, F.getType())) - 1; if (C.NumRegs > TTIRegNum) { // Cost already exceeded TTIRegNum, then only newly added register can add // new instructions. @@ -3165,6 +3168,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n"); return; } + assert(IVSrc && "Failed to find IV chain source"); LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n"); Type *IVTy = IVSrc->getType(); @@ -3265,12 +3269,12 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // requirements for both N and i at the same time. Limiting this code to // equality icmps is not a problem because all interesting loops use // equality icmps, thanks to IndVarSimplify. - if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) + if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) { + // If CI can be saved in some target, like replaced inside hardware loop + // in PowerPC, no need to generate initial formulae for it. + if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition())) + continue; if (CI->isEquality()) { - // If CI can be saved in some target, like replaced inside hardware loop - // in PowerPC, no need to generate initial formulae for it. - if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition())) - continue; // Swap the operands if needed to put the OperandValToReplace on the // left, for consistency. Value *NV = CI->getOperand(1); @@ -3298,6 +3302,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { Factors.insert(-(uint64_t)Factors[i]); Factors.insert(-1); } + } // Get or create an LSRUse. 
std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy); @@ -4834,6 +4839,7 @@ void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() { } } } + assert(Best && "Failed to find best LSRUse candidate"); LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best << " will yield profitable reuse.\n"); @@ -5740,7 +5746,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { *L->getHeader()->getParent()); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache( *L->getHeader()->getParent()); - auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI( + *L->getHeader()->getParent()); return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 86891eb451bb..92ad8dafa5ab 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -166,7 +167,7 @@ static bool computeUnrollAndJamCount( bool UseUpperBound = false; bool ExplicitUnroll = computeUnrollCount( L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount, - OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); + /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound); if (ExplicitUnroll || UseUpperBound) { // If the user explicitly set the loop as unrolled, dont UnJ it. Leave it // for the unroller instead. @@ -293,9 +294,9 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (Latch != Exit || SubLoopLatch != SubLoopExit) return LoopUnrollResult::Unmodified; - TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, SE, TTI, nullptr, nullptr, OptLevel, - None, None, None, None, None, None); + TargetTransformInfo::UnrollingPreferences UP = + gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None, + None, None, None, None, None, None, None); if (AllowUnrollAndJam.getNumOccurrences() > 0) UP.UnrollAndJam = AllowUnrollAndJam; if (UnrollAndJamThreshold.getNumOccurrences() > 0) @@ -426,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. 
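The function-level driver being added here follows a common shape for running a loop transform from a function pass: canonicalize every loop first (loop-simplify form plus LCSSA), then pull loops off a priority worklist and transform them one by one. A sketch of that shape, where the Transform callback is a placeholder assumption:

#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

// Canonicalize all loops, then visit them from a worklist, outer loops first.
static bool runOnAllLoops(LoopInfo &LI, DominatorTree &DT, ScalarEvolution &SE,
                          AssumptionCache &AC,
                          function_ref<bool(Loop *)> Transform) {
  bool Changed = false;
  for (Loop *L : LI) {
    Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, /*MSSAU=*/nullptr,
                            /*PreserveLCSSA=*/false);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  SmallPriorityWorklist<Loop *, 4> Worklist;
  internal::appendLoopsToWorklist(reverse(LI), Worklist);
  while (!Worklist.empty())
    Changed |= Transform(Worklist.pop_back_val());
  return Changed;
}

Simplification runs before the worklist is populated because it may create new inner loops that the legality checks need to see, which is the same ordering the patch spells out in its comment.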
+ for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist<Loop *, 4> Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<DependenceAnalysisWrapperPass>(); - getLoopAnalysisUsage(AU); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } }; @@ -480,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -491,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); + DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); + DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F); + OptimizationRemarkEmitter &ORE = + AM.getResult<OptimizationRemarkEmitterAnalysis>(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2fa7436213dd..4c2b079c6bb5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -46,6 +46,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -178,7 +179,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel, Optional<unsigned> UserThreshold, 
Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, - Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) { + Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling, + Optional<bool> UserAllowProfileBasedPeeling, + Optional<unsigned> UserFullUnrollMaxCount) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -202,6 +205,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = false; UP.AllowPeeling = true; UP.UnrollAndJam = false; + UP.PeelProfiledIterations = true; UP.UnrollAndJamInnerLoopThreshold = 60; // Override with any target specific settings @@ -209,7 +213,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; @@ -257,6 +262,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( UP.UpperBound = *UserUpperBound; if (UserAllowPeeling.hasValue()) UP.AllowPeeling = *UserAllowPeeling; + if (UserAllowProfileBasedPeeling.hasValue()) + UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling; + if (UserFullUnrollMaxCount.hasValue()) + UP.FullUnrollMaxCount = *UserFullUnrollMaxCount; return UP; } @@ -730,7 +739,7 @@ bool llvm::computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, - unsigned &TripMultiple, unsigned LoopSize, + bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -781,18 +790,34 @@ bool llvm::computeUnrollCount( // Also we need to check if we exceed FullUnrollMaxCount. // If using the upper bound to unroll, TripMultiple should be set to 1 because // we do not know when loop may exit. - // MaxTripCount and ExactTripCount cannot both be non zero since we only + + // We can unroll by the upper bound amount if it's generally allowed or if + // we know that the loop is executed either the upper bound or zero times. + // (MaxOrZero unrolling keeps only the first loop test, so the number of + // loop tests remains the same compared to the non-unrolled version, whereas + // the generic upper bound unrolling keeps all but the last loop test so the + // number of loop tests goes up which may end up being worse on targets with + // constrained branch predictor resources so is controlled by an option.) + // In addition we only unroll small upper bounds. + unsigned FullUnrollMaxTripCount = MaxTripCount; + if (!(UP.UpperBound || MaxOrZero) || + FullUnrollMaxTripCount > UnrollMaxUpperBound) + FullUnrollMaxTripCount = 0; + + // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only // compute the former when the latter is zero. unsigned ExactTripCount = TripCount; - assert((ExactTripCount == 0 || MaxTripCount == 0) && - "ExtractTripCount and MaxTripCount cannot both be non zero."); - unsigned FullUnrollTripCount = ExactTripCount ? 
ExactTripCount : MaxTripCount;
+ assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+ "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+ unsigned FullUnrollTripCount =
+ ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
UP.Count = FullUnrollTripCount;
if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
// When computing the unrolled size, note that BEInsns are not replicated
// like the rest of the loop body.
if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -806,7 +831,7 @@ bool llvm::computeUnrollCount(
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (MaxTripCount == FullUnrollTripCount);
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
TripCount = FullUnrollTripCount;
TripMultiple = UP.UpperBound ? 1 : TripMultiple;
return ExplicitUnroll;
@@ -882,6 +907,8 @@ bool llvm::computeUnrollCount(
"because "
"unrolled size is too large.";
});
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
return ExplicitUnroll;
}
assert(TripCount == 0 &&
@@ -903,6 +930,12 @@ bool llvm::computeUnrollCount(
return false;
}
+ // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+ if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ UP.Count = 0;
+ return false;
+ }
+
// Check if the runtime trip count is too small when profile is available.
if (L->getHeader()->getParent()->hasProfileData()) {
if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
@@ -966,7 +999,11 @@ bool llvm::computeUnrollCount(
if (UP.Count > UP.MaxCount)
UP.Count = UP.MaxCount;
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+
+ if (MaxTripCount && UP.Count > MaxTripCount)
+ UP.Count = MaxTripCount;
+
+ LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
<< "\n");
if (UP.Count < 2)
UP.Count = 0;
@@ -976,13 +1013,14 @@ static LoopUnrollResult tryToUnrollLoop(
Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- bool PreserveLCSSA, int OptLevel,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
- Optional<bool> ProvidedAllowPeeling) {
+ Optional<bool> ProvidedAllowPeeling,
+ Optional<bool> ProvidedAllowProfileBasedPeeling,
+ Optional<unsigned> ProvidedFullUnrollMaxCount) {
LLVM_DEBUG(dbgs() << "Loop Unroll: F["
<< L->getHeader()->getParent()->getName() << "] Loop %"
<< L->getHeader()->getName() << "\n");
@@ -1007,7 +1045,8 @@ static LoopUnrollResult tryToUnrollLoop(
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling);
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+
ProvidedFullUnrollMaxCount); // Exit early if unrolling is disabled. For OptForSize, we pick the loop size // as threshold later on. @@ -1028,10 +1067,10 @@ static LoopUnrollResult tryToUnrollLoop( return LoopUnrollResult::Unmodified; } - // When optimizing for size, use LoopSize as threshold, to (fully) unroll - // loops, if it does not increase code size. + // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold + // later), to (fully) unroll loops, if it does not increase code size. if (OptForSize) - UP.Threshold = std::max(UP.Threshold, LoopSize); + UP.Threshold = std::max(UP.Threshold, LoopSize + 1); if (NumInlineCandidates != 0) { LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); @@ -1040,7 +1079,6 @@ static LoopUnrollResult tryToUnrollLoop( // Find trip count and trip multiple if count is not available unsigned TripCount = 0; - unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -1070,28 +1108,18 @@ static LoopUnrollResult tryToUnrollLoop( // Try to find the trip count upper bound if we cannot find the exact trip // count. + unsigned MaxTripCount = 0; bool MaxOrZero = false; if (!TripCount) { MaxTripCount = SE.getSmallConstantMaxTripCount(L); MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); - // We can unroll by the upper bound amount if it's generally allowed or if - // we know that the loop is executed either the upper bound or zero times. - // (MaxOrZero unrolling keeps only the first loop test, so the number of - // loop tests remains the same compared to the non-unrolled version, whereas - // the generic upper bound unrolling keeps all but the last loop test so the - // number of loop tests goes up which may end up being worse on targets with - // constrained branch predictor resources so is controlled by an option.) - // In addition we only unroll small upper bounds. - if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) { - MaxTripCount = 0; - } } // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, + L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return LoopUnrollResult::Unmodified; @@ -1139,7 +1167,7 @@ static LoopUnrollResult tryToUnrollLoop( // If the loop was peeled, we already "used up" the profile information // we had, so we don't want to unroll or peel again. 
if (UnrollResult != LoopUnrollResult::FullyUnrolled && - (IsCountSetExplicitly || UP.PeelCount)) + (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))) L->setLoopAlreadyUnrolled(); return UnrollResult; @@ -1169,18 +1197,24 @@ public: Optional<bool> ProvidedRuntime; Optional<bool> ProvidedUpperBound; Optional<bool> ProvidedAllowPeeling; + Optional<bool> ProvidedAllowProfileBasedPeeling; + Optional<unsigned> ProvidedFullUnrollMaxCount; LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false, bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, Optional<bool> UpperBound = None, - Optional<bool> AllowPeeling = None) + Optional<bool> AllowPeeling = None, + Optional<bool> AllowProfileBasedPeeling = None, + Optional<unsigned> ProvidedFullUnrollMaxCount = None) : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced), ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound), - ProvidedAllowPeeling(AllowPeeling) { + ProvidedAllowPeeling(AllowPeeling), + ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling), + ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1203,10 +1237,11 @@ public: bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); LoopUnrollResult Result = tryToUnrollLoop( - L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, - PreserveLCSSA, OptLevel, OnlyWhenForced, - ForgetAllSCEV, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound, ProvidedAllowPeeling); + L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel, + OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound, + ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, + ProvidedFullUnrollMaxCount); if (Result == LoopUnrollResult::FullyUnrolled) LPM.markLoopAsDeleted(*L); @@ -1283,14 +1318,16 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, std::string LoopName = L.getName(); - bool Changed = - tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, - /*BFI*/ nullptr, /*PSI*/ nullptr, - /*PreserveLCSSA*/ true, OptLevel, OnlyWhenForced, - ForgetSCEV, /*Count*/ None, - /*Threshold*/ None, /*AllowPartial*/ false, - /*Runtime*/ false, /*UpperBound*/ false, - /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified; + bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*BFI*/ nullptr, /*PSI*/ nullptr, + /*PreserveLCSSA*/ true, OptLevel, + OnlyWhenForced, ForgetSCEV, /*Count*/ None, + /*Threshold*/ None, /*AllowPartial*/ false, + /*Runtime*/ false, /*UpperBound*/ false, + /*AllowPeeling*/ false, + /*AllowProfileBasedPeeling*/ false, + /*FullUnrollMaxCount*/ None) != + LoopUnrollResult::Unmodified; if (!Changed) return PreservedAnalyses::all(); @@ -1430,7 +1467,8 @@ PreservedAnalyses LoopUnrollPass::run(Function &F, /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced, UnrollOpts.ForgetSCEV, /*Count*/ None, /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime, - UnrollOpts.AllowUpperBound, LocalAllowPeeling); + UnrollOpts.AllowUpperBound, LocalAllowPeeling, + UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount); Changed |= Result != LoopUnrollResult::Unmodified; 
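A minimal standalone model (plain C++ with illustrative names, not the LLVM API) of the gating the two new options introduce above: a loop is marked as already unrolled only when the count was explicit, or when profile-based peeling is allowed and actually peeled iterations.

#include <cstdio>

struct UnrollPrefs {
  bool PeelProfiledIterations; // mirrors UP.PeelProfiledIterations
  unsigned PeelCount;          // mirrors UP.PeelCount
};

// Mirrors: UnrollResult != FullyUnrolled &&
//          (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount))
static bool shouldPinLoop(bool FullyUnrolled, bool CountSetExplicitly,
                          const UnrollPrefs &UP) {
  return !FullyUnrolled &&
         (CountSetExplicitly ||
          (UP.PeelProfiledIterations && UP.PeelCount != 0));
}

int main() {
  UnrollPrefs ProfilePeel{true, 3}, PeelDisallowed{false, 3};
  std::printf("%d %d\n", shouldPinLoop(false, false, ProfilePeel),
              shouldPinLoop(false, false, PeelDisallowed)); // prints: 1 0
}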
// The parent must not be damaged by unrolling! diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index b5b8e720069c..915e053704b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -59,6 +59,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -420,7 +421,8 @@ enum OperatorChain { /// cost of creating an entirely new loop. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, OperatorChain &ParentChain, - DenseMap<Value *, Value *> &Cache) { + DenseMap<Value *, Value *> &Cache, + MemorySSAUpdater *MSSAU) { auto CacheIt = Cache.find(Cond); if (CacheIt != Cache.end()) return CacheIt->second; @@ -438,7 +440,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // TODO: Handle: br (VARIANT|INVARIANT). // Hoist simple values out. - if (L->makeLoopInvariant(Cond, Changed)) { + if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) { Cache[Cond] = Cond; return Cond; } @@ -478,7 +480,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // which will cause the branch to go away in one loop and the condition to // simplify in the other one. if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = LHS; return LHS; } @@ -486,7 +488,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, // operand(1). ParentChain = NewChain; if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed, - ParentChain, Cache)) { + ParentChain, Cache, MSSAU)) { Cache[Cond] = RHS; return RHS; } @@ -500,12 +502,12 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, /// Cond is a condition that occurs in L. If it is invariant in the loop, or has /// an invariant piece, return the invariant along with the operator chain type. /// Otherwise, return null. -static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond, - Loop *L, - bool &Changed) { +static std::pair<Value *, OperatorChain> +FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + MemorySSAUpdater *MSSAU) { DenseMap<Value *, Value *> Cache; OperatorChain OpChain = OC_OpChainNone; - Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache); + Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU); // In case we do find a LIV, it can not be obtained by walking up a mixed // operator chain. 
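Since the cache-and-chain logic above is spread over two overloads, a compact standalone model may help (plain C++ with illustrative types, not the LLVM API): a memoized walk up an and/or chain that declines to return an invariant found through a mixed chain, matching the mixed-operator-chain check described above.

#include <unordered_map>

// Illustrative stand-ins, not LLVM types.
struct Expr {
  enum Kind { Leaf, And, Or } K = Leaf;
  Expr *L = nullptr, *R = nullptr;
  bool Invariant = false; // models "loop invariant"
};

enum class Chain { None, And, Or, Mixed };

// Memoized search for an invariant sub-expression; gives up rather than
// return a value reached through a mixed and/or chain.
Expr *findInvariant(Expr *E, Chain &C,
                    std::unordered_map<Expr *, Expr *> &Cache) {
  auto It = Cache.find(E);
  if (It != Cache.end())
    return It->second;
  if (E->Invariant)
    return Cache[E] = E;
  if (E->K == Expr::Leaf)
    return Cache[E] = nullptr;

  Chain This = (E->K == Expr::And) ? Chain::And : Chain::Or;
  if (C != Chain::None && C != This)
    return Cache[E] = nullptr; // mixed chain: a hit here would be unusable
  C = This;
  if (Expr *Hit = findInvariant(E->L, C, Cache))
    return Cache[E] = Hit;
  if (Expr *Hit = findInvariant(E->R, C, Cache))
    return Cache[E] = Hit;
  return Cache[E] = nullptr;
}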
@@ -525,7 +527,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); if (EnableMSSALoopDependency) { MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA(); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); assert(DT && "Cannot update MemorySSA without a valid DomTree."); } currentLoop = L; @@ -682,7 +684,7 @@ bool LoopUnswitch::processCurrentLoop() { for (auto &I : *BB) { auto CS = CallSite(&I); if (!CS) continue; - if (CS.hasFnAttr(Attribute::Convergent)) + if (CS.isConvergent()) return false; if (auto *II = dyn_cast<InvokeInst>(&I)) if (!II->getUnwindDest()->canSplitPredecessors()) @@ -694,8 +696,9 @@ bool LoopUnswitch::processCurrentLoop() { } for (IntrinsicInst *Guard : Guards) { - Value *LoopCond = - FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { // NB! Unswitching (if successful) could have erased some of the @@ -735,8 +738,9 @@ bool LoopUnswitch::processCurrentLoop() { if (BI->isConditional()) { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && !EqualityPropUnSafe(*LoopCond) && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; @@ -748,7 +752,7 @@ bool LoopUnswitch::processCurrentLoop() { Value *LoopCond; OperatorChain OpChain; std::tie(LoopCond, OpChain) = - FindLIVLoopCondition(SC, currentLoop, Changed); + FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get()); unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { @@ -808,8 +812,9 @@ bool LoopUnswitch::processCurrentLoop() { for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; @@ -1123,8 +1128,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (!BI->isConditional()) return false; - Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1157,8 +1163,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { return true; } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { // If this isn't switching on an invariant condition, we can't unswitch it. 
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed).first; + Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop, + Changed, MSSAU.get()) + .first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -1240,6 +1247,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopBlocks.clear(); NewBlocks.clear(); + if (MSSAU && VerifyMemorySSA) + MSSA->verifyMemorySSA(); + // First step, split the preheader and exit blocks, and add these blocks to // the LoopBlocks list. BasicBlock *NewPreheader = @@ -1607,36 +1617,30 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // If BI's parent is the only pred of the successor, fold the two blocks // together. BasicBlock *Pred = BI->getParent(); + (void)Pred; BasicBlock *Succ = BI->getSuccessor(0); BasicBlock *SinglePred = Succ->getSinglePredecessor(); if (!SinglePred) continue; // Nothing to do. assert(SinglePred == Pred && "CFG broken"); - LLVM_DEBUG(dbgs() << "Merging blocks: " << Pred->getName() << " <- " - << Succ->getName() << "\n"); - - // Resolve any single entry PHI nodes in Succ. - while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) - ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist, L, LPM, - MSSAU.get()); - - // If Succ has any successors with PHI nodes, update them to have - // entries coming from Pred instead of Succ. - Succ->replaceAllUsesWith(Pred); - - // Move all of the successor contents from Succ to Pred. - Pred->getInstList().splice(BI->getIterator(), Succ->getInstList(), - Succ->begin(), Succ->end()); - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Succ, Pred, BI); + // Make the LPM and Worklist updates specific to LoopUnswitch. LPM->deleteSimpleAnalysisValue(BI, L); RemoveFromWorklist(BI, Worklist); - BI->eraseFromParent(); - - // Remove Succ from the loop tree. - LI->removeBlock(Succ); LPM->deleteSimpleAnalysisValue(Succ, L); - Succ->eraseFromParent(); + auto SuccIt = Succ->begin(); + while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) { + for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It) + if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It))) + Worklist.push_back(Use); + for (User *U : PN->users()) + Worklist.push_back(cast<Instruction>(U)); + LPM->deleteSimpleAnalysisValue(PN, L); + RemoveFromWorklist(PN, Worklist); + ++NumSimplify; + } + // Merge the block and make the remaining analyses updates. + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get()); ++NumSimplify; continue; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp index 896dd8bcb922..7b9af527d444 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp @@ -79,6 +79,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -112,37 +113,6 @@ static cl::opt<unsigned> LVLoopDepthThreshold( "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"), cl::init(2), cl::Hidden); -/// Create MDNode for input string. 
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
-}
-
-/// Set input string into loop metadata by keeping other values intact.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
- unsigned V) {
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, retain it.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- MDs.push_back(Node);
- }
- }
- // Add new metadata.
- MDs.push_back(createStringMetadata(TheLoop, MDString, V));
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- TheLoop->setLoopID(NewLoopID);
-}
-
namespace {
struct LoopVersioningLICM : public LoopPass {
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index e076424d9042..ab7b85e89e7b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -14,6 +14,7 @@
#include "llvm/Transforms/Scalar/LowerAtomic.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
new file mode 100644
index 000000000000..21c6c32e8e02
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -0,0 +1,171 @@
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
+// and provides constant propagation and basic CFG cleanup on the result.
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +#define DEBUG_TYPE "lower-is-constant-intrinsic" + +STATISTIC(IsConstantIntrinsicsHandled, + "Number of 'is.constant' intrinsic calls handled"); +STATISTIC(ObjectSizeIntrinsicsHandled, + "Number of 'objectsize' intrinsic calls handled"); + +static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) { + Value *Op = II->getOperand(0); + + return isa<Constant>(Op) ? ConstantInt::getTrue(II->getType()) + : ConstantInt::getFalse(II->getType()); +} + +static bool replaceConditionalBranchesOnConstant(Instruction *II, + Value *NewValue) { + bool HasDeadBlocks = false; + SmallSetVector<Instruction *, 8> Worklist; + replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr, + &Worklist); + for (auto I : Worklist) { + BranchInst *BI = dyn_cast<BranchInst>(I); + if (!BI) + continue; + if (BI->isUnconditional()) + continue; + + BasicBlock *Target, *Other; + if (match(BI->getOperand(0), m_Zero())) { + Target = BI->getSuccessor(1); + Other = BI->getSuccessor(0); + } else if (match(BI->getOperand(0), m_One())) { + Target = BI->getSuccessor(0); + Other = BI->getSuccessor(1); + } else { + Target = nullptr; + Other = nullptr; + } + if (Target && Target != Other) { + BasicBlock *Source = BI->getParent(); + Other->removePredecessor(Source); + BI->eraseFromParent(); + BranchInst::Create(Target, Source); + if (pred_begin(Other) == pred_end(Other)) + HasDeadBlocks = true; + } + } + return HasDeadBlocks; +} + +static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) { + bool HasDeadBlocks = false; + const auto &DL = F.getParent()->getDataLayout(); + SmallVector<WeakTrackingVH, 8> Worklist; + + ReversePostOrderTraversal<Function *> RPOT(&F); + for (BasicBlock *BB : RPOT) { + for (Instruction &I: *BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) + continue; + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::is_constant: + case Intrinsic::objectsize: + Worklist.push_back(WeakTrackingVH(&I)); + break; + } + } + } + for (WeakTrackingVH &VH: Worklist) { + // Items on the worklist can be mutated by earlier recursive replaces. + // This can remove the intrinsic as dead (VH == null), but also replace + // the intrinsic in place. 
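+ // (WeakTrackingVH goes null when the tracked instruction is deleted and
+ // follows replaceAllUsesWith, so both a folded-away intrinsic and one
+ // replaced by a non-intrinsic value are skipped by the checks below.)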
+ if (!VH)
+ continue;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+ if (!II)
+ continue;
+ Value *NewValue;
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::is_constant:
+ NewValue = lowerIsConstantIntrinsic(II);
+ IsConstantIntrinsicsHandled++;
+ break;
+ case Intrinsic::objectsize:
+ NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+ ObjectSizeIntrinsicsHandled++;
+ break;
+ }
+ HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+ }
+ if (HasDeadBlocks)
+ removeUnreachableBlocks(F);
+ return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' as part of Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+ static char ID;
+ LowerConstantIntrinsics() : FunctionPass(ID) {
+ initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ return lowerConstantIntrinsics(F, TLI);
+ }
+};
+} // namespace
+
+char LowerConstantIntrinsics::ID = 0;
+INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
+ "Lower constant intrinsics", false, false)
+
+FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
+ return new LowerConstantIntrinsics();
+}
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 0d67c0d740ec..53671c7bc3d1 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -22,10 +22,12 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/MisExpect.h"

using namespace llvm;

@@ -71,15 +73,20 @@ static bool handleSwitchExpect(SwitchInst &SI) {
unsigned n = SI.getNumCases(); // +1 for default case.
SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
- if (Case == *SI.case_default())
- Weights[0] = LikelyBranchWeight;
- else
- Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
+ uint64_t Index = (Case == *SI.case_default()) ?
0 : Case.getCaseIndex() + 1; + Weights[Index] = LikelyBranchWeight; + + SI.setMetadata( + LLVMContext::MD_misexpect, + MDBuilder(CI->getContext()) + .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight)); + + SI.setCondition(ArgValue); + misexpect::checkFrontendInstrumentation(SI); SI.setMetadata(LLVMContext::MD_prof, MDBuilder(CI->getContext()).createBranchWeights(Weights)); - SI.setCondition(ArgValue); return true; } @@ -155,7 +162,7 @@ static void handlePhiDef(CallInst *Expect) { return Result; }; - auto *PhiDef = dyn_cast<PHINode>(V); + auto *PhiDef = cast<PHINode>(V); // Get the first dominating conditional branch of the operand // i's incoming block. @@ -280,19 +287,28 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; + MDNode *ExpNode; if ((ExpectedValue->getZExtValue() == ValueComparedTo) == - (Predicate == CmpInst::ICMP_EQ)) + (Predicate == CmpInst::ICMP_EQ)) { Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); - else + ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight); + } else { Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); + ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight); + } - BSI.setMetadata(LLVMContext::MD_prof, Node); + BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode); if (CmpI) CmpI->setOperand(0, ArgValue); else BSI.setCondition(ArgValue); + + misexpect::checkFrontendInstrumentation(BSI); + + BSI.setMetadata(LLVMContext::MD_prof, Node); + return true; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 9489e01774d6..45f5929e3b90 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -60,7 +61,7 @@ static bool lowerGuardIntrinsic(Function &F) { DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); for (auto *CI : ToLower) { - makeGuardControlFlowExplicit(DeoptIntrinsic, CI); + makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false); CI->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp new file mode 100644 index 000000000000..0ff6ee8bcfcc --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -0,0 +1,894 @@ +//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower matrix intrinsics to vector operations. +// +// TODO: +// * Implement multiply & add fusion +// * Add remark, summarizing the available matrix optimization opportunities. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-matrix-intrinsics"
+
+static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",
+ cl::init(true));
+
+static cl::opt<bool> AllowContractEnabled(
+ "matrix-allow-contract", cl::init(false), cl::Hidden,
+ cl::desc("Allow the use of FMAs if available and profitable. This may "
+ "result in different results, due to less rounding error."));
+
+namespace {
+
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of column \p Col with type (\p EltType x \p NumRows)
+// assuming \p Stride elements between the starts of two consecutive columns.
+// \p Stride must be >= \p NumRows.
+//
+// Consider a 4x4 matrix like below
+//
+// 0 1 2 3
+// 0 v_0_0 v_0_1 v_0_2 v_0_3
+// 1 v_1_0 v_1_1 v_1_2 v_1_3
+// 2 v_2_0 v_2_1 v_2_2 v_2_3
+// 3 v_3_0 v_3_1 v_3_2 v_3_3

+// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
+// we need a pointer to the first element of the submatrix as base pointer.
+// Then we can use computeColumnAddr to compute the addresses for the columns
+// of the sub-matrix.
+//
+// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+// -> just returns Base
+// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (1 * 4)
+// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (2 * 4)
+//
+// The graphic below illustrates the number of elements in a column (marked
+// with |) and the number of skipped elements (marked with {).
+//
+// v_0_0 v_0_1 {v_0_2 {v_0_3
+// Base Col 1 Col 2
+// | | |
+// v_1_0 |v_1_1 |v_1_2 |v_1_3
+// v_2_0 |v_2_1 |v_2_2 |v_2_3
+// v_3_0 {v_3_1 {v_3_2 v_3_3
+//
+Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride,
+ unsigned NumRows, Type *EltType,
+ IRBuilder<> &Builder) {
+
+ assert((!isa<ConstantInt>(Stride) ||
+ cast<ConstantInt>(Stride)->getZExtValue() >= NumRows) &&
+ "Stride must be >= the number of rows.");
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+
+ // Compute the start of the column with index Col as Col * Stride.
+ Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start");
+
+ // Get pointer to the start of the selected column. Skip GEP creation,
+ // if we select column 0.
+ if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero())
+ ColumnStart = BasePtr;
+ else
+ ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep");
+
+ // Cast elementwise column start pointer to a pointer to a column
+ // (EltType x NumRows)*.
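+ // For example, with EltType == double and NumRows == 4 the column pointer
+ // has type <4 x double>*, in the same address space AS as BasePtr.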
+ Type *ColumnType = VectorType::get(EltType, NumRows);
+ Type *ColumnPtrType = PointerType::get(ColumnType, AS);
+ return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast");
+}
+
+/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
+///
+/// Currently, the lowering for each matrix intrinsic is done as follows:
+/// 1. Propagate the shape information from intrinsics to connected
+/// instructions.
+/// 2. Lower instructions with shape information.
+/// 2.1. Get column vectors for each argument. If we already lowered the
+/// definition of an argument, use the produced column vectors directly.
+/// If not, split the operand vector containing an embedded matrix into
+/// a set of column vectors,
+/// 2.2. Lower the instruction in terms of columnwise operations, which yields
+/// a set of column vectors containing the result matrix. Note that we lower
+/// all instructions that have shape information. Besides the intrinsics,
+/// this includes stores for example.
+/// 2.3. Update uses of the lowered instruction. If we have shape information
+/// for a user, there is nothing to do, as we will look up the result
+/// column matrix when lowering the user. For other uses, we embed the
+/// result matrix in a flat vector and update the use.
+/// 2.4. Cache the result column matrix for the instruction we lowered.
+/// 3. After we lowered all instructions in a function, remove the now
+/// obsolete instructions.
+///
+class LowerMatrixIntrinsics {
+ Function &Func;
+ const DataLayout &DL;
+ const TargetTransformInfo &TTI;
+
+ /// Wrapper class representing a matrix as a set of column vectors.
+ /// All column vectors must have the same vector type.
+ class ColumnMatrixTy {
+ SmallVector<Value *, 16> Columns;
+
+ public:
+ ColumnMatrixTy() : Columns() {}
+ ColumnMatrixTy(ArrayRef<Value *> Cols)
+ : Columns(Cols.begin(), Cols.end()) {}
+
+ Value *getColumn(unsigned i) const { return Columns[i]; }
+
+ void setColumn(unsigned i, Value *V) { Columns[i] = V; }
+
+ size_t getNumColumns() const { return Columns.size(); }
+ size_t getNumRows() const {
+ assert(Columns.size() > 0 && "Cannot call getNumRows without columns");
+ return cast<VectorType>(Columns[0]->getType())->getNumElements();
+ }
+
+ const SmallVectorImpl<Value *> &getColumnVectors() const { return Columns; }
+
+ SmallVectorImpl<Value *> &getColumnVectors() { return Columns; }
+
+ void addColumn(Value *V) { Columns.push_back(V); }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> columns() {
+ return make_range(Columns.begin(), Columns.end());
+ }
+
+ /// Embed the columns of the matrix into a flat vector by concatenating
+ /// them.
+ Value *embedInVector(IRBuilder<> &Builder) const {
+ return Columns.size() == 1 ? Columns[0]
+ : concatenateVectors(Builder, Columns);
+ }
+ };
+
+ struct ShapeInfo {
+ unsigned NumRows;
+ unsigned NumColumns;
+
+ ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+ : NumRows(NumRows), NumColumns(NumColumns) {}
+
+ ShapeInfo(Value *NumRows, Value *NumColumns)
+ : NumRows(cast<ConstantInt>(NumRows)->getZExtValue()),
+ NumColumns(cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+ bool operator==(const ShapeInfo &other) {
+ return NumRows == other.NumRows && NumColumns == other.NumColumns;
+ }
+ bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+ /// Returns true if shape-information is defined, meaning both dimensions
+ /// are != 0.
+ operator bool() const {
+ assert(NumRows == 0 || NumColumns != 0);
+ return NumRows != 0;
+ }
+ };
+
+ /// Maps instructions to their shape information. The shape information
+ /// describes the shape to be used while lowering. This matches the shape of
+ /// the result value of the instruction, with the only exceptions being store
+ /// instructions and the matrix_columnwise_store intrinsics. For those, the
+ /// shape information indicates that those instructions should be lowered
+ /// using shape information as well.
+ DenseMap<Value *, ShapeInfo> ShapeMap;
+
+ /// List of instructions to remove. While lowering, we are not replacing all
+ /// users of a lowered instruction, if shape information is available, and
+ /// those need to be removed after we finished lowering.
+ SmallVector<Instruction *, 16> ToRemove;
+
+ /// Map from instructions to their produced column matrix.
+ DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
+
+public:
+ LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
+ : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
+
+ /// Return the set of column vectors that a matrix value is lowered to.
+ ///
+ /// If we lowered \p MatrixVal, just return the cached result column matrix.
+ /// Otherwise split the flat vector \p MatrixVal containing a matrix with
+ /// shape \p SI into column vectors.
+ ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+ IRBuilder<> Builder) {
+ VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
+ assert(VType && "MatrixVal must be a vector type");
+ assert(VType->getNumElements() == SI.NumRows * SI.NumColumns &&
+ "The vector size must match the number of matrix elements");
+
+ // Check if we lowered MatrixVal using shape information. In that case,
+ // return the existing column matrix, if it matches the requested shape
+ // information. If there is a mis-match, embed the result in a flat
+ // vector and split it later.
+ auto Found = Inst2ColumnMatrix.find(MatrixVal);
+ if (Found != Inst2ColumnMatrix.end()) {
+ ColumnMatrixTy &M = Found->second;
+ // Return the found matrix, if its shape matches the requested shape
+ // information
+ if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
+ return M;
+
+ MatrixVal = M.embedInVector(Builder);
+ }
+
+ // Otherwise split MatrixVal.
+ SmallVector<Value *, 16> SplitVecs;
+ Value *Undef = UndefValue::get(VType);
+ for (unsigned MaskStart = 0; MaskStart < VType->getNumElements();
+ MaskStart += SI.NumRows) {
+ Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0);
+ Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split");
+ SplitVecs.push_back(V);
+ }
+
+ return {SplitVecs};
+ }
+
+ /// If \p V already has a known shape, return false. Otherwise set the shape
+ /// for instructions that support it.
+ bool setShapeInfo(Value *V, ShapeInfo Shape) { + assert(Shape && "Shape not set"); + if (isa<UndefValue>(V) || !supportsShapeInfo(V)) + return false; + + auto SIter = ShapeMap.find(V); + if (SIter != ShapeMap.end()) { + LLVM_DEBUG(dbgs() << " not overriding existing shape: " + << SIter->second.NumRows << " " + << SIter->second.NumColumns << " for " << *V << "\n"); + return false; + } + + ShapeMap.insert({V, Shape}); + LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns + << " for " << *V << "\n"); + return true; + } + + bool isUniformShape(Value *V) { + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return true; + + switch (I->getOpcode()) { + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: // Scalar multiply. + case Instruction::Add: + case Instruction::Mul: + case Instruction::Sub: + return true; + default: + return false; + } + } + + /// Returns true if shape information can be used for \p V. The supported + /// instructions must match the instructions that can be lowered by this pass. + bool supportsShapeInfo(Value *V) { + Instruction *Inst = dyn_cast<Instruction>(V); + if (!Inst) + return false; + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II) + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + return true; + default: + return false; + } + return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V); + } + + /// Propagate the shape information of instructions to their users. + /// The work list contains instructions for which we can compute the shape, + /// either based on the information provided by matrix intrinsics or known + /// shapes of operands. + SmallVector<Instruction *, 32> + propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + // Pop an element for which we guaranteed to have at least one of the + // operand shapes. Add the shape for this and then add users to the work + // list. + LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n"); + while (!WorkList.empty()) { + Instruction *Inst = WorkList.back(); + WorkList.pop_back(); + + // New entry, set the value and insert operands + bool Propagate = false; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + Propagate = setShapeInfo(Inst, {M, K}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {N, M}); + } else if (match(Inst, + m_Intrinsic<Intrinsic::matrix_columnwise_load>( + m_Value(), m_Value(), m_Value(M), m_Value(N)))) { + Propagate = setShapeInfo(Inst, {M, N}); + } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) { + auto OpShape = ShapeMap.find(MatrixA); + if (OpShape != ShapeMap.end()) + setShapeInfo(Inst, OpShape->second); + continue; + } else if (isUniformShape(Inst)) { + // Find the first operand that has a known shape and use that. 
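+ // (For these element-wise opcodes the result and all operands share one
+ // shape, so any single known operand shape determines the rest.)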
+ for (auto &Op : Inst->operands()) { + auto OpShape = ShapeMap.find(Op.get()); + if (OpShape != ShapeMap.end()) { + Propagate |= setShapeInfo(Inst, OpShape->second); + break; + } + } + } + + if (Propagate) { + NewWorkList.push_back(Inst); + for (auto *User : Inst->users()) + if (ShapeMap.count(User) == 0) + WorkList.push_back(cast<Instruction>(User)); + } + } + + return NewWorkList; + } + + /// Propagate the shape to operands of instructions with shape information. + /// \p Worklist contains the instruction for which we already know the shape. + SmallVector<Instruction *, 32> + propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) { + SmallVector<Instruction *, 32> NewWorkList; + + auto pushInstruction = [](Value *V, + SmallVectorImpl<Instruction *> &WorkList) { + Instruction *I = dyn_cast<Instruction>(V); + if (I) + WorkList.push_back(I); + }; + // Pop an element with known shape. Traverse the operands, if their shape + // derives from the result shape and is unknown, add it and add them to the + // worklist. + LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + + size_t BeforeProcessingV = WorkList.size(); + if (!isa<Instruction>(V)) + continue; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + + if (setShapeInfo(MatrixB, {N, K})) + pushInstruction(MatrixB, WorkList); + + } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + if (setShapeInfo(MatrixA, {M, N})) + pushInstruction(MatrixA, WorkList); + } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + if (setShapeInfo(MatrixA, {M, N})) { + pushInstruction(MatrixA, WorkList); + } + } else if (isa<LoadInst>(V) || + match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) { + // Nothing to do, no matrix input. + } else if (isa<StoreInst>(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. + ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast<Instruction>(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + pushInstruction(U.get(), WorkList); + } + } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa<Instruction>(U) && V != U) + NewWorkList.push_back(cast<Instruction>(U)); + } + return NewWorkList; + } + + bool Visit() { + if (EnableShapePropagation) { + SmallVector<Instruction *, 32> WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. 
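+ // Only the matrix intrinsics carry their dimensions as explicit constant
+ // arguments; every other instruction learns its shape through the
+ // forward/backward propagation below.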
+ for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } + } + + ReversePostOrderTraversal<Function *> RPOT(&Func); + bool Changed = false; + for (auto *BB : RPOT) { + for (Instruction &Inst : make_early_inc_range(*BB)) { + IRBuilder<> Builder(&Inst); + + if (CallInst *CInst = dyn_cast<CallInst>(&Inst)) + Changed |= VisitCallInst(CInst); + + Value *Op1; + Value *Op2; + if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst)) + Changed |= VisitBinaryOperator(BinOp); + if (match(&Inst, m_Load(m_Value(Op1)))) + Changed |= VisitLoad(&Inst, Op1, Builder); + else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2)))) + Changed |= VisitStore(&Inst, Op1, Op2, Builder); + } + } + + for (Instruction *Inst : reverse(ToRemove)) + Inst->eraseFromParent(); + + return Changed; + } + + LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType, + IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load"); + } + + StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr, + Type *EltType, IRBuilder<> Builder) { + unsigned Align = DL.getABITypeAlignment(EltType); + return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align); + } + + + /// Turns \p BasePtr into an elementwise pointer to \p EltType. + Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) { + unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace(); + Type *EltPtrType = PointerType::get(EltType, AS); + return Builder.CreatePointerCast(BasePtr, EltPtrType); + } + + /// Replace intrinsic calls + bool VisitCallInst(CallInst *Inst) { + if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic()) + return false; + + switch (Inst->getCalledFunction()->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + LowerMultiply(Inst); + break; + case Intrinsic::matrix_transpose: + LowerTranspose(Inst); + break; + case Intrinsic::matrix_columnwise_load: + LowerColumnwiseLoad(Inst); + break; + case Intrinsic::matrix_columnwise_store: + LowerColumnwiseStore(Inst); + break; + default: + return false; + } + return true; + } + + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto VType = cast<VectorType>(Inst->getType()); + Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); + ColumnMatrixTy Result; + // Distance between start of one column and the start of the next + for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { + Value *GEP = + computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows, + VType->getElementType(), Builder); + Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); + Result.addColumn(Column); + } + + finalizeLowering(Inst, Result, Builder); + } + + /// Lowers llvm.matrix.columnwise.load. + /// + /// The intrinsic loads a matrix from memory using a stride between columns. 
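+ /// Its operands are (Ptr, Stride, Rows, Columns); each of the Columns
+ /// columns becomes one vector load of Rows elements starting at
+ /// Ptr + Col * Stride (see LowerLoad and computeColumnAddr).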
+ void LowerColumnwiseLoad(CallInst *Inst) {
+ Value *Ptr = Inst->getArgOperand(0);
+ Value *Stride = Inst->getArgOperand(1);
+ LowerLoad(Inst, Ptr, Stride,
+ {Inst->getArgOperand(2), Inst->getArgOperand(3)});
+ }
+
+ void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
+ ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ auto VType = cast<VectorType>(Matrix->getType());
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ auto LM = getMatrix(Matrix, Shape, Builder);
+ for (auto C : enumerate(LM.columns())) {
+ Value *GEP =
+ computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride,
+ Shape.NumRows, VType->getElementType(), Builder);
+ createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
+ }
+
+ ToRemove.push_back(Inst);
+ }
+
+ /// Lowers llvm.matrix.columnwise.store.
+ ///
+ /// The intrinsic stores a matrix back to memory using a stride between columns.
+ void LowerColumnwiseStore(CallInst *Inst) {
+ Value *Matrix = Inst->getArgOperand(0);
+ Value *Ptr = Inst->getArgOperand(1);
+ Value *Stride = Inst->getArgOperand(2);
+ LowerStore(Inst, Matrix, Ptr, Stride,
+ {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+ }
+
+ /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from
+ /// the matrix \p LM represented as a vector of column vectors.
+ Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J,
+ unsigned NumElts, IRBuilder<> Builder) {
+ Value *Col = LM.getColumn(J);
+ Value *Undef = UndefValue::get(Col->getType());
+ Constant *Mask = createSequentialMask(Builder, I, NumElts, 0);
+ return Builder.CreateShuffleVector(Col, Undef, Mask, "block");
+ }
+
+ // Set elements I..I+NumElts-1 to Block
+ Value *insertVector(Value *Col, unsigned I, Value *Block,
+ IRBuilder<> Builder) {
+
+ // First, bring Block to the same size as Col
+ unsigned BlockNumElts =
+ cast<VectorType>(Block->getType())->getNumElements();
+ unsigned NumElts = cast<VectorType>(Col->getType())->getNumElements();
+ assert(NumElts >= BlockNumElts && "Too few elements for current block");
+
+ Value *ExtendMask =
+ createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts);
+ Value *Undef = UndefValue::get(Block->getType());
+ Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask);
+
+ // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
+ // 8, 4, 5, 6
+ SmallVector<Constant *, 16> Mask;
+ unsigned i;
+ for (i = 0; i < I; i++)
+ Mask.push_back(Builder.getInt32(i));
+
+ unsigned VecNumElts = cast<VectorType>(Col->getType())->getNumElements();
+ for (; i < I + BlockNumElts; i++)
+ Mask.push_back(Builder.getInt32(i - I + VecNumElts));
+
+ for (; i < VecNumElts; i++)
+ Mask.push_back(Builder.getInt32(i));
+
+ Value *MaskVal = ConstantVector::get(Mask);
+
+ return Builder.CreateShuffleVector(Col, Block, MaskVal);
+ }
+
+ Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
+ IRBuilder<> &Builder, bool AllowContraction) {
+
+ if (!Sum)
+ return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+
+ if (UseFPOp) {
+ if (AllowContraction) {
+ // Use fmuladd for floating point operations and let the backend decide
+ // if that's profitable.
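+ // (llvm.fmuladd permits but does not require fusion: the target may
+ // lower it to a single fused multiply-add or to separate fmul/fadd.)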
+ Value *FMulAdd = Intrinsic::getDeclaration(
+ Func.getParent(), Intrinsic::fmuladd, A->getType());
+ return Builder.CreateCall(FMulAdd, {A, B, Sum});
+ }
+ Value *Mul = Builder.CreateFMul(A, B);
+ return Builder.CreateFAdd(Sum, Mul);
+ }
+
+ Value *Mul = Builder.CreateMul(A, B);
+ return Builder.CreateAdd(Sum, Mul);
+ }
+
+ /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
+ /// users with shape information, there's nothing to do: they will use the
+ /// cached value when they are lowered. For other users, \p Matrix is
+ /// flattened and the uses are updated to use it. Also marks \p Inst for
+ /// deletion.
+ void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,
+ IRBuilder<> &Builder) {
+ Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+
+ ToRemove.push_back(Inst);
+ Value *Flattened = nullptr;
+ for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+ Use &U = *I++;
+ if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
+ if (!Flattened)
+ Flattened = Matrix.embedInVector(Builder);
+ U.set(Flattened);
+ }
+ }
+ }
+
+ /// Lowers llvm.matrix.multiply.
+ void LowerMultiply(CallInst *MatMul) {
+ IRBuilder<> Builder(MatMul);
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const ColumnMatrixTy &Lhs =
+ getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+ const ColumnMatrixTy &Rhs =
+ getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+
+ const unsigned R = LShape.NumRows;
+ const unsigned M = LShape.NumColumns;
+ const unsigned C = RShape.NumColumns;
+ assert(M == RShape.NumRows);
+
+ // Initialize the output
+ ColumnMatrixTy Result;
+ for (unsigned J = 0; J < C; ++J)
+ Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));
+
+ const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /
+ EltType->getPrimitiveSizeInBits(),
+ uint64_t(1));
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
+ // Multiply columns from the first operand with scalars from the second
+ // operand. Then move along the K axes and accumulate the columns. With
+ // this the adds can be vectorized without reassociation.
+ for (unsigned J = 0; J < C; ++J) {
+ unsigned BlockSize = VF;
+ for (unsigned I = 0; I < R; I += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (I + BlockSize > R)
+ BlockSize /= 2;
+
+ Value *Sum = nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *L = extractVector(Lhs, I, K, BlockSize, Builder);
+ Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+ Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
+ Builder, AllowContract);
+ }
+ Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
+ }
+ }
+ finalizeLowering(MatMul, Result, Builder);
+ }
+
+ /// Lowers llvm.matrix.transpose.
+ void LowerTranspose(CallInst *Inst) {
+ ColumnMatrixTy Result;
+ IRBuilder<> Builder(Inst);
+ Value *InputVal = Inst->getArgOperand(0);
+ VectorType *VectorTy = cast<VectorType>(InputVal->getType());
+ ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
+ ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+ for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) {
+ // Build a single column vector for this row. First initialize it.
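+ // ResultColumn becomes column 'Row' of the transposed matrix: it gathers
+ // the element at row 'Row' from every input column.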
+ Value *ResultColumn = UndefValue::get(
+ VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns));
+
+ // Go through the elements of this row and insert them into the resulting
+ // column vector.
+ for (auto C : enumerate(InputMatrix.columns())) {
+ Value *Elt = Builder.CreateExtractElement(C.value(), Row);
+ // We insert at index Column since that is the row index after the
+ // transpose.
+ ResultColumn =
+ Builder.CreateInsertElement(ResultColumn, Elt, C.index());
+ }
+ Result.addColumn(ResultColumn);
+ }
+
+ finalizeLowering(Inst, Result, Builder);
+ }
+
+ /// Lower load instructions, if shape information is available.
+ bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ return true;
+ }
+
+ bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr,
+ IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(StoredVal);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ return true;
+ }
+
+ /// Lower binary operators, if shape information is available.
+ bool VisitBinaryOperator(BinaryOperator *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Lhs = Inst->getOperand(0);
+ Value *Rhs = Inst->getOperand(1);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder);
+ ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder);
+
+ // Apply the operation to each pair of columns and store the result back
+ // into the mapping.
+ ColumnMatrixTy Result;
+ auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Add:
+ return Builder.CreateAdd(LHS, RHS);
+ case Instruction::Mul:
+ return Builder.CreateMul(LHS, RHS);
+ case Instruction::Sub:
+ return Builder.CreateSub(LHS, RHS);
+ case Instruction::FAdd:
+ return Builder.CreateFAdd(LHS, RHS);
+ case Instruction::FMul:
+ return Builder.CreateFMul(LHS, RHS);
+ case Instruction::FSub:
+ return Builder.CreateFSub(LHS, RHS);
+ default:
+ llvm_unreachable("Unsupported binary operator for matrix");
+ }
+ };
+ for (unsigned C = 0; C < Shape.NumColumns; ++C)
+ Result.addColumn(
+ BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C)));
+
+ finalizeLowering(Inst, Result, Builder);
+ return true;
+ }
+};
+} // namespace
+
+PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ LowerMatrixIntrinsics LMT(F, TTI);
+ if (LMT.Visit()) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
+ initializeLowerMatrixIntrinsicsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LowerMatrixIntrinsics LMT(F, *TTI);
+ bool C = LMT.Visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+} // namespace
+
+static const char pass_name[] = "Lower the matrix intrinsics";
+char LowerMatrixIntrinsicsLegacyPass::ID = 0;
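+// Illustrative recap of the LowerMultiply scheme above (assumed 2x2 shapes,
+// not code from this patch): with LHS columns L0, L1 and RHS columns R0, R1,
+// each result column J is accumulated over the inner dimension K as
+//   Result.col(J) = L0 * splat(RJ[0]) + L1 * splat(RJ[1])
+// so every add combines whole column vectors and vectorizes without
+// reassociation; with contraction enabled each step is an fmuladd.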
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) +INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name, + false, false) + +Pass *llvm::createLowerMatrixIntrinsicsPass() { + return new LowerMatrixIntrinsicsLegacyPass(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 5342f2ddcb6b..73b2cd06fa23 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index 789232e0f5ce..5ffae128f5f0 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -33,10 +33,11 @@ #include "llvm/Transforms/Scalar/MakeGuardsExplicit.h" #include "llvm/Analysis/GuardUtils.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/GuardUtils.h" @@ -56,23 +57,11 @@ struct MakeGuardsExplicitLegacyPass : public FunctionPass { static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { // Replace the guard with an explicit branch (just like in GuardWidening). - BasicBlock *BB = Guard->getParent(); - makeGuardControlFlowExplicit(DeoptIntrinsic, Guard); - BranchInst *ExplicitGuard = cast<BranchInst>(BB->getTerminator()); - assert(ExplicitGuard->isConditional() && "Must be!"); + BasicBlock *OriginalBB = Guard->getParent(); + (void)OriginalBB; + makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true); + assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold"); - // We want the guard to be expressed as explicit control flow, but still be - // widenable. For that, we add Widenable Condition intrinsic call to the - // guard's condition. 
- IRBuilder<> B(ExplicitGuard); - auto *WidenableCondition = - B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, - {}, {}, nullptr, "widenable_cond"); - WidenableCondition->setCallingConv(Guard->getCallingConv()); - auto *NewCond = - B.CreateAnd(ExplicitGuard->getCondition(), WidenableCondition); - NewCond->setName("exiplicit_guard_cond"); - ExplicitGuard->setCondition(NewCond); Guard->eraseFromParent(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index bb5ec253cbf2..c24fa40860eb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -24,7 +24,6 @@ #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <cstdint> @@ -69,90 +70,6 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); -static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, - bool &VariableIdxFound, - const DataLayout &DL) { - // Skip over the first indices. - gep_type_iterator GTI = gep_type_begin(GEP); - for (unsigned i = 1; i != Idx; ++i, ++GTI) - /*skip along*/; - - // Compute the offset implied by the rest of the indices. - int64_t Offset = 0; - for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(GEP->getOperand(i)); - if (!OpC) - return VariableIdxFound = true; - if (OpC->isZero()) continue; // No offset. - - // Handle struct indices, which add their field offset to the pointer. - if (StructType *STy = GTI.getStructTypeOrNull()) { - Offset += DL.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - continue; - } - - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); - Offset += Size*OpC->getSExtValue(); - } - - return Offset; -} - -/// Return true if Ptr1 is provably equal to Ptr2 plus a constant offset, and -/// return that constant offset. For example, Ptr1 might be &A[42], and Ptr2 -/// might be &A[40]. In this case offset would be -8. -static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - const DataLayout &DL) { - Ptr1 = Ptr1->stripPointerCasts(); - Ptr2 = Ptr2->stripPointerCasts(); - - // Handle the trivial case first. - if (Ptr1 == Ptr2) { - Offset = 0; - return true; - } - - GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1); - GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2); - - bool VariableIdxFound = false; - - // If one pointer is a GEP and the other isn't, then see if the GEP is a - // constant offset from the base, as in "P" and "gep P, 1". 
- if (GEP1 && !GEP2 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { - Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - if (GEP2 && !GEP1 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { - Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, DL); - return !VariableIdxFound; - } - - // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical - // base. After that base, they may have some number of common (and - // potentially variable) indices. After that they handle some constant - // offset, which determines their offset from each other. At this point, we - // handle no other case. - if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) - return false; - - // Skip any common indices and track the GEP types. - unsigned Idx = 1; - for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) - if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) - break; - - int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, DL); - int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, DL); - if (VariableIdxFound) return false; - - Offset = Offset2-Offset1; - return true; -} - namespace { /// Represents a range of memset'd bytes with the ByteVal value. @@ -419,12 +336,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, - DL)) + Optional<int64_t> Offset = + isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL); + if (!Offset) break; - Ranges.addStore(Offset, NextStore); + Ranges.addStore(*Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); @@ -433,11 +350,11 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, break; // Check to see if this store is to a constant offset from the start ptr. 
- int64_t Offset; - if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, DL)) + Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL); + if (!Offset) break; - Ranges.addMemSet(Offset, MSI); + Ranges.addMemSet(*Offset, MSI); } } @@ -471,16 +388,12 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, StartPtr = Range.StartPtr; // Determine alignment - unsigned Alignment = Range.Alignment; - if (Alignment == 0) { - Type *EltType = - cast<PointerType>(StartPtr->getType())->getElementType(); - Alignment = DL.getABITypeAlignment(EltType); - } - - AMemSet = - Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + const Align Alignment = DL.getValueOrABITypeAlignment( + MaybeAlign(Range.Alignment), + cast<PointerType>(StartPtr->getType())->getElementType()); + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start, + Alignment); LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI : Range.TheStores) dbgs() << *SI << '\n'; @@ -500,25 +413,21 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst, return AMemSet; } -static unsigned findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) - StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType()); - return StoreAlign; +static Align findStoreAlignment(const DataLayout &DL, const StoreInst *SI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), + SI->getOperand(0)->getType()); } -static unsigned findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); - return LoadAlign; +static Align findLoadAlignment(const DataLayout &DL, const LoadInst *LI) { + return DL.getValueOrABITypeAlignment(MaybeAlign(LI->getAlignment()), + LI->getType()); } -static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI, - const LoadInst *LI) { - unsigned StoreAlign = findStoreAlignment(DL, SI); - unsigned LoadAlign = findLoadAlignment(DL, LI); - return MinAlign(StoreAlign, LoadAlign); +static Align findCommonAlignment(const DataLayout &DL, const StoreInst *SI, + const LoadInst *LI) { + Align StoreAlign = findStoreAlignment(DL, SI); + Align LoadAlign = findLoadAlignment(DL, LI); + return commonAlignment(StoreAlign, LoadAlign); } // This method try to lift a store instruction before position P. 
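As a reading aid for the Align-based helpers above, here is a minimal sketch of their building blocks, assuming only llvm/Support/Alignment.h; the values are made up and the snippet is not part of this patch:

#include "llvm/Support/Alignment.h"
#include <cassert>

using namespace llvm;

int main() {
  // commonAlignment is the conservative meet of two known alignments, i.e.
  // the smaller one: a location aligned to both 16 and 4 can only be relied
  // on to be 4-byte aligned.
  assert(commonAlignment(Align(16), Align(4)) == Align(4));
  // MaybeAlign models a possibly-unspecified alignment; an unset value is
  // falsy, and DataLayout::getValueOrABITypeAlignment substitutes the ABI
  // type alignment for it, as findStoreAlignment/findLoadAlignment now do.
  assert(!MaybeAlign());
  assert(*MaybeAlign(8) == Align(8));
  return 0;
}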
@@ -733,7 +642,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { LI, SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), DL.getTypeStoreSize(SI->getOperand(0)->getType()), - findCommonAlignment(DL, SI, LI), C); + findCommonAlignment(DL, SI, LI).value(), C); if (changed) { MD->removeInstruction(SI); SI->eraseFromParent(); @@ -766,12 +675,11 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { auto *T = V->getType(); if (T->isAggregateType()) { uint64_t Size = DL.getTypeStoreSize(T); - unsigned Align = SI->getAlignment(); - if (!Align) - Align = DL.getABITypeAlignment(T); + const Align MA = + DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), T); IRBuilder<> Builder(SI); auto *M = - Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, Align); + Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, MA); LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n"); @@ -983,7 +891,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, // If the destination wasn't sufficiently aligned then increase its alignment. if (!isDestSufficientlyAligned) { assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!"); - cast<AllocaInst>(cpyDest)->setAlignment(srcAlign); + cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign)); } // Drop any cached information about the call, because we may have changed @@ -1066,12 +974,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // example we could be moving from movaps -> movq on x86. IRBuilder<> Builder(M); if (UseMemMove) - Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); else - Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(), - MDep->getRawSource(), MDep->getSourceAlignment(), + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), + MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); // Remove the instruction we're replacing. 
@@ -1141,7 +1049,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
Builder.CreateMemSet(
Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
SrcSize),
- MemSet->getOperand(1), MemsetLen, Align);
+ MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));

MD->removeInstruction(MemSet);
MemSet->eraseFromParent();
@@ -1209,8 +1117,8 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
}

IRBuilder<> Builder(MemCpy);
- Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
- CopySize, MemCpy->getDestAlignment());
+ Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1), CopySize,
+ MaybeAlign(MemCpy->getDestAlignment()));

return true;
}

@@ -1237,7 +1145,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
M->getModule()->getDataLayout())) {
IRBuilder<> Builder(M);
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
- M->getDestAlignment(), false);
+ MaybeAlign(M->getDestAlignment()), false);
MD->removeInstruction(M);
M->eraseFromParent();
++NumCpyToSet;
@@ -1520,7 +1428,7 @@ bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
return false;

auto *MD = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

auto LookupAliasAnalysis = [this]() -> AliasAnalysis & {
return getAnalysis<AAResultsWrapperPass>().getAAResults();
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index 3d047a193267..ce1e142101b8 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -50,6 +50,7 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -897,7 +898,7 @@ public:
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// MergeICmps does not need the DominatorTree, but we update it if it's
// already available.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 30645f4400e3..6b0d0202d9bb 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -14,9 +14,11 @@
// diamond (hammock) and merges them into a single load in the header. Similar
// it sinks and merges two stores to the tail block (footer). The algorithm
// iterates over the instructions of one side of the diamond and attempts to
-// find a matching load/store on the other side. It hoists / sinks when it
-// thinks it safe to do so. This optimization helps with eg. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
+// find a matching load/store on the other side. A new tail/footer block may be
+// inserted if the tail/footer block has more predecessors (not only the two
+// predecessors that form the diamond).
It hoists / sinks when it thinks
+// it is safe to do so. This optimization helps with e.g. hiding load latencies,
+// triggering if-conversion, and reducing static code size.
//
// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
//
@@ -81,6 +83,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
@@ -103,7 +106,9 @@ class MergedLoadStoreMotion {
// Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
const int MagicCompileTimeControl = 250;

+ const bool SplitFooterBB;
public:
+ MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
bool run(Function &F, AliasAnalysis &AA);

private:
@@ -114,7 +119,9 @@ private:
PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
bool isStoreSinkBarrierInRange(const Instruction &Start,
const Instruction &End, MemoryLocation Loc);
- bool sinkStore(BasicBlock *BB, StoreInst *SinkCand, StoreInst *ElseInst);
+ bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
+ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
+ StoreInst *ElseInst);
bool mergeStores(BasicBlock *BB);
};
} // end anonymous namespace
@@ -217,74 +224,82 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
}

///
+/// Check if 2 stores can be sunk together with corresponding GEPs.
+///
+bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
+ StoreInst *S1) const {
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
+}
+
+///
/// Merge two stores to same address and sink into \p BB
///
/// Also sinks GEP instruction computing the store address
///
-bool MergedLoadStoreMotion::sinkStore(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
+void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
// Only one definition?
auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- if (A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
- (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
- (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0)) {
- LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
- // Hoist the instruction.
- BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
- // Intersect optional metadata.
- S0->andIRFlags(S1);
- S0->dropUnknownNonDebugMetadata();
-
- // Create the new store to be inserted at the join point.
- StoreInst *SNew = cast<StoreInst>(S0->clone());
- Instruction *ANew = A0->clone();
- SNew->insertBefore(&*InsertPt);
- ANew->insertBefore(SNew);
-
- assert(S0->getParent() == A0->getParent());
- assert(S1->getParent() == A1->getParent());
-
- // New PHI operand? Use it.
- if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) - SNew->setOperand(0, NewPN); - S0->eraseFromParent(); - S1->eraseFromParent(); - A0->replaceAllUsesWith(ANew); - A0->eraseFromParent(); - A1->replaceAllUsesWith(ANew); - A1->eraseFromParent(); - return true; - } - return false; + LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump(); + dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n"; + dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n"); + // Hoist the instruction. + BasicBlock::iterator InsertPt = BB->getFirstInsertionPt(); + // Intersect optional metadata. + S0->andIRFlags(S1); + S0->dropUnknownNonDebugMetadata(); + + // Create the new store to be inserted at the join point. + StoreInst *SNew = cast<StoreInst>(S0->clone()); + Instruction *ANew = A0->clone(); + SNew->insertBefore(&*InsertPt); + ANew->insertBefore(SNew); + + assert(S0->getParent() == A0->getParent()); + assert(S1->getParent() == A1->getParent()); + + // New PHI operand? Use it. + if (PHINode *NewPN = getPHIOperand(BB, S0, S1)) + SNew->setOperand(0, NewPN); + S0->eraseFromParent(); + S1->eraseFromParent(); + A0->replaceAllUsesWith(ANew); + A0->eraseFromParent(); + A1->replaceAllUsesWith(ANew); + A1->eraseFromParent(); } /// /// True when two stores are equivalent and can sink into the footer /// -/// Starting from a diamond tail block, iterate over the instructions in one -/// predecessor block and try to match a store in the second predecessor. +/// Starting from a diamond head block, iterate over the instructions in one +/// successor block and try to match a store in the second successor. /// -bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { +bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) { bool MergedStores = false; - assert(T && "Footer of a diamond cannot be empty"); - - pred_iterator PI = pred_begin(T), E = pred_end(T); - assert(PI != E); - BasicBlock *Pred0 = *PI; - ++PI; - BasicBlock *Pred1 = *PI; - ++PI; + BasicBlock *TailBB = getDiamondTail(HeadBB); + BasicBlock *SinkBB = TailBB; + assert(SinkBB && "Footer of a diamond cannot be empty"); + + succ_iterator SI = succ_begin(HeadBB); + assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors"); + BasicBlock *Pred0 = *SI; + ++SI; + assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor"); + BasicBlock *Pred1 = *SI; // tail block of a diamond/hammock? if (Pred0 == Pred1) return false; // No. - if (PI != E) - return false; // No. More than 2 predecessors. - - // #Instructions in Succ1 for Compile Time Control + // bail out early if we can not merge into the footer BB + if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3)) + return false; + // #Instructions in Pred1 for Compile Time Control auto InstsNoDbg = Pred1->instructionsWithoutDebug(); int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end()); int NStores = 0; @@ -304,14 +319,23 @@ bool MergedLoadStoreMotion::mergeStores(BasicBlock *T) { if (NStores * Size1 >= MagicCompileTimeControl) break; if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) { - bool Res = sinkStore(T, S0, S1); - MergedStores |= Res; - // Don't attempt to sink below stores that had to stick around - // But after removal of a store and some of its feeding - // instruction search again from the beginning since the iterator - // is likely stale at this point. 
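+      // Illustrative shape for the footer-splitting path below (not from this
+      // patch): if preds(tail) = { then, else, other }, the two diamond arms
+      // are first rerouted through a fresh block that only they reach:
+      //
+      //   then --> tail.sink.split <-- else        other
+      //                  |                           |
+      //                  +----------> tail <---------+
+      //
+      // and the merged store plus its GEP are sunk into tail.sink.split.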
- if (!Res) + if (!canSinkStoresAndGEPs(S0, S1)) + // Don't attempt to sink below stores that had to stick around + // But after removal of a store and some of its feeding + // instruction search again from the beginning since the iterator + // is likely stale at this point. break; + + if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) { + // We have more than 2 predecessors. Insert a new block + // postdominating 2 predecessors we're going to sink from. + SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split"); + if (!SinkBB) + break; + } + + MergedStores = true; + sinkStoresAndGEPs(SinkBB, S0, S1); RBI = Pred0->rbegin(); RBE = Pred0->rend(); LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump()); @@ -328,13 +352,15 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. + // This loop doesn't care about newly inserted/split blocks + // since they never will be diamond heads. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) { BasicBlock *BB = &*FI++; // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { - Changed |= mergeStores(getDiamondTail(BB)); + Changed |= mergeStores(BB); } } return Changed; @@ -342,9 +368,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) { namespace { class MergedLoadStoreMotionLegacyPass : public FunctionPass { + const bool SplitFooterBB; public: static char ID; // Pass identification, replacement for typeid - MergedLoadStoreMotionLegacyPass() : FunctionPass(ID) { + MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false) + : FunctionPass(ID), SplitFooterBB(SplitFooterBB) { initializeMergedLoadStoreMotionLegacyPassPass( *PassRegistry::getPassRegistry()); } @@ -355,13 +383,14 @@ public: bool runOnFunction(Function &F) override { if (skipFunction(F)) return false; - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(SplitFooterBB); return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults()); } private: void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); + if (!SplitFooterBB) + AU.setPreservesCFG(); AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -373,8 +402,8 @@ char MergedLoadStoreMotionLegacyPass::ID = 0; /// /// createMergedLoadStoreMotionPass - The public interface to this file. 
/// -FunctionPass *llvm::createMergedLoadStoreMotionPass() { - return new MergedLoadStoreMotionLegacyPass(); +FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) { + return new MergedLoadStoreMotionLegacyPass(SplitFooterBB); } INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion", @@ -385,13 +414,14 @@ INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion", PreservedAnalyses MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { - MergedLoadStoreMotion Impl; + MergedLoadStoreMotion Impl(Options.SplitFooterBB); auto &AA = AM.getResult<AAManager>(F); if (!Impl.run(F, AA)) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserveSet<CFGAnalyses>(); + if (!Options.SplitFooterBB) + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 94436b55752a..bba9082e31b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -82,7 +82,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -101,10 +100,12 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> @@ -170,7 +171,7 @@ bool NaryReassociateLegacyPass::runOnFunction(Function &F) { auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return Impl.runImpl(F, AC, DT, SE, TLI, TTI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp index 08ac2b666fce..6a643480f312 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -76,7 +76,6 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -89,10 +88,12 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/ArrayRecycler.h" @@ -105,6 +106,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/Local.h" #include 
"llvm/Transforms/Utils/PredicateInfo.h" #include "llvm/Transforms/Utils/VNCoercion.h" #include <algorithm> @@ -122,6 +124,7 @@ using namespace llvm; using namespace llvm::GVNExpression; using namespace llvm::VNCoercion; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "newgvn" @@ -487,11 +490,11 @@ namespace { class NewGVN { Function &F; - DominatorTree *DT; - const TargetLibraryInfo *TLI; - AliasAnalysis *AA; - MemorySSA *MSSA; - MemorySSAWalker *MSSAWalker; + DominatorTree *DT = nullptr; + const TargetLibraryInfo *TLI = nullptr; + AliasAnalysis *AA = nullptr; + MemorySSA *MSSA = nullptr; + MemorySSAWalker *MSSAWalker = nullptr; const DataLayout &DL; std::unique_ptr<PredicateInfo> PredInfo; @@ -503,7 +506,7 @@ class NewGVN { const SimplifyQuery SQ; // Number of function arguments, used by ranking - unsigned int NumFuncArgs; + unsigned int NumFuncArgs = 0; // RPOOrdering of basic blocks DenseMap<const DomTreeNode *, unsigned> RPOOrdering; @@ -514,9 +517,9 @@ class NewGVN { // startsout in, and represents any value. Being an optimistic analysis, // anything in the TOP class has the value TOP, which is indeterminate and // equivalent to everything. - CongruenceClass *TOPClass; + CongruenceClass *TOPClass = nullptr; std::vector<CongruenceClass *> CongruenceClasses; - unsigned NextCongruenceNum; + unsigned NextCongruenceNum = 0; // Value Mappings. DenseMap<Value *, CongruenceClass *> ValueToClass; @@ -656,7 +659,7 @@ public: TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA, const DataLayout &DL) : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), - PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), + PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {} bool runGVN(); @@ -860,7 +863,7 @@ private: // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. - int64_t StartingVNCounter; + int64_t StartingVNCounter = 0; }; } // end anonymous namespace @@ -1332,7 +1335,7 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, E->setOpcode(0); E->op_push_back(PointerOp); if (LI) - E->setAlignment(LI->getAlignment()); + E->setAlignment(MaybeAlign(LI->getAlignment())); // TODO: Value number heap versions. We may be able to discover // things alias analysis can't on it's own (IE that a store and a @@ -1637,8 +1640,11 @@ const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { if (AA->doesNotAccessMemory(CI)) { return createCallExpression(CI, TOPClass->getMemoryLeader()); } else if (AA->onlyReadsMemory(CI)) { - MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI); - return createCallExpression(CI, DefiningAccess); + if (auto *MA = MSSA->getMemoryAccess(CI)) { + auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA); + return createCallExpression(CI, DefiningAccess); + } else // MSSA determined that CI does not access memory. + return createCallExpression(CI, TOPClass->getMemoryLeader()); } return nullptr; } @@ -1754,7 +1760,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps, return true; }); // If we are left with no operands, it's dead. - if (empty(Filtered)) { + if (Filtered.empty()) { // If it has undef at this point, it means there are no-non-undef arguments, // and thus, the value of the phi node must be undef. 
if (HasUndef) { @@ -2464,9 +2470,9 @@ Value *NewGVN::findConditionEquivalence(Value *Cond) const { // Process the outgoing edges of a block for reachability. void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { // Evaluate reachability of terminator instruction. - BranchInst *BR; - if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) { - Value *Cond = BR->getCondition(); + Value *Cond; + BasicBlock *TrueSucc, *FalseSucc; + if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) { Value *CondEvaluated = findConditionEquivalence(Cond); if (!CondEvaluated) { if (auto *I = dyn_cast<Instruction>(Cond)) { @@ -2479,8 +2485,6 @@ void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) { } } ConstantInt *CI; - BasicBlock *TrueSucc = BR->getSuccessor(0); - BasicBlock *FalseSucc = BR->getSuccessor(1); if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) { if (CI->isOne()) { LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI @@ -4196,7 +4200,7 @@ bool NewGVNLegacyPass::runOnFunction(Function &F) { return false; return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F), &getAnalysis<AAResultsWrapperPass>().getAAResults(), &getAnalysis<MemorySSAWrapperPass>().getMSSA(), F.getParent()->getDataLayout()) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 039123218544..58763ec72ece 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -161,7 +162,7 @@ public: return false; TargetLibraryInfo *TLI = - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); return runPartiallyInlineLibCalls(F, TLI, TTI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp index b544f0a39ea8..5c4a89977c38 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp @@ -47,6 +47,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/ADT/SetVector.h" @@ -131,7 +132,7 @@ struct PlaceBackedgeSafepointsImpl : public FunctionPass { SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); for (Loop *I : *LI) { runOnLoopAndSubLoops(I); } @@ -240,7 +241,7 @@ static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header, static bool mustBeFiniteCountedLoop(Loop *L, 
ScalarEvolution *SE, BasicBlock *Pred) { // A conservative bound on the loop as a whole. - const SCEV *MaxTrips = SE->getMaxBackedgeTakenCount(L); + const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L); if (MaxTrips != SE->getCouldNotCompute() && SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN( CountedLoopTripWidth)) @@ -478,7 +479,7 @@ bool PlaceSafepoints::runOnFunction(Function &F) { return false; const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Modified = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp index fa8c9e2a5fe4..41940e980faa 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -30,7 +30,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" #include "llvm/IR/BasicBlock.h" @@ -50,12 +49,14 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <algorithm> #include <cassert> #include <utility> @@ -173,7 +174,7 @@ void ReassociatePass::BuildRankMap(Function &F, << "\n"); } - // Traverse basic blocks in ReversePostOrder + // Traverse basic blocks in ReversePostOrder. for (BasicBlock *BB : RPOT) { unsigned BBRank = RankMap[BB] = ++Rank << 16; @@ -861,7 +862,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // this use. We do this by moving it to the entry block (if it is a // non-instruction value) or right after the definition. These negates will // be zapped by reassociate later, so we don't need much finesse here. - BinaryOperator *TheNeg = cast<BinaryOperator>(U); + Instruction *TheNeg = cast<Instruction>(U); // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) @@ -1898,6 +1899,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I, ValueRankMap.erase(I); Insts.remove(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); for (auto Op : Ops) if (Instruction *OpInst = dyn_cast<Instruction>(Op)) @@ -1914,6 +1916,7 @@ void ReassociatePass::EraseInst(Instruction *I) { // Erase the dead instruction. ValueRankMap.erase(I); RedoInsts.remove(I); + llvm::salvageDebugInfoOrMarkUndef(*I); I->eraseFromParent(); // Optimize its operands. SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. @@ -1938,88 +1941,132 @@ void ReassociatePass::EraseInst(Instruction *I) { MadeChange = true; } -// Canonicalize expressions of the following form: -// x + (-Constant * y) -> x - (Constant * y) -// x - (-Constant * y) -> x + (Constant * y) -Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { - if (!I->hasOneUse() || I->getType()->isVectorTy()) - return nullptr; - - // Must be a fmul or fdiv instruction. 
- unsigned Opcode = I->getOpcode(); - if (Opcode != Instruction::FMul && Opcode != Instruction::FDiv) - return nullptr; - - auto *C0 = dyn_cast<ConstantFP>(I->getOperand(0)); - auto *C1 = dyn_cast<ConstantFP>(I->getOperand(1)); - - // Both operands are constant, let it get constant folded away. - if (C0 && C1) - return nullptr; - - ConstantFP *CF = C0 ? C0 : C1; - - // Must have one constant operand. - if (!CF) - return nullptr; +/// Recursively analyze an expression to build a list of instructions that have +/// negative floating-point constant operands. The caller can then transform +/// the list to create positive constants for better reassociation and CSE. +static void getNegatibleInsts(Value *V, + SmallVectorImpl<Instruction *> &Candidates) { + // Handle only one-use instructions. Combining negations does not justify + // replicating instructions. + Instruction *I; + if (!match(V, m_OneUse(m_Instruction(I)))) + return; - // Must be a negative ConstantFP. - if (!CF->isNegative()) - return nullptr; + // Handle expressions of multiplications and divisions. + // TODO: This could look through floating-point casts. + const APFloat *C; + switch (I->getOpcode()) { + case Instruction::FMul: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant())) + break; - // User must be a binary operator with one or more uses. - Instruction *User = I->user_back(); - if (!isa<BinaryOperator>(User) || User->use_empty()) - return nullptr; + if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + case Instruction::FDiv: + // Not expecting non-canonical code here. Bail out and wait. + if (match(I->getOperand(0), m_Constant()) && + match(I->getOperand(1), m_Constant())) + break; - unsigned UserOpcode = User->getOpcode(); - if (UserOpcode != Instruction::FAdd && UserOpcode != Instruction::FSub) - return nullptr; + if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) || + (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) { + Candidates.push_back(I); + LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n'); + } + getNegatibleInsts(I->getOperand(0), Candidates); + getNegatibleInsts(I->getOperand(1), Candidates); + break; + default: + break; + } +} - // Subtraction is not commutative. Explicitly, the following transform is - // not valid: (-Constant * y) - x -> x + (Constant * y) - if (!User->isCommutative() && User->getOperand(1) != I) +/// Given an fadd/fsub with an operand that is a one-use instruction +/// (the fadd/fsub), try to change negative floating-point constants into +/// positive constants to increase potential for reassociation and CSE. +Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I, + Instruction *Op, + Value *OtherOp) { + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub"); + + // Collect instructions with negative FP constants from the subtree that ends + // in Op. + SmallVector<Instruction *, 4> Candidates; + getNegatibleInsts(Op, Candidates); + if (Candidates.empty()) return nullptr; // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the // resulting subtract will be broken up later. This can get us into an // infinite loop during reassociation. 
- if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User)) + bool IsFSub = I->getOpcode() == Instruction::FSub; + bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1; + if (NeedsSubtract && ShouldBreakUpSubtract(I)) return nullptr; - // Change the sign of the constant. - APFloat Val = CF->getValueAPF(); - Val.changeSign(); - I->setOperand(C0 ? 0 : 1, ConstantFP::get(CF->getContext(), Val)); - - // Canonicalize I to RHS to simplify the next bit of logic. E.g., - // ((-Const*y) + x) -> (x + (-Const*y)). - if (User->getOperand(0) == I && User->isCommutative()) - cast<BinaryOperator>(User)->swapOperands(); - - Value *Op0 = User->getOperand(0); - Value *Op1 = User->getOperand(1); - BinaryOperator *NI; - switch (UserOpcode) { - default: - llvm_unreachable("Unexpected Opcode!"); - case Instruction::FAdd: - NI = BinaryOperator::CreateFSub(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; - case Instruction::FSub: - NI = BinaryOperator::CreateFAdd(Op0, Op1); - NI->setFastMathFlags(cast<FPMathOperator>(User)->getFastMathFlags()); - break; + for (Instruction *Negatible : Candidates) { + const APFloat *C; + if (match(Negatible->getOperand(0), m_APFloat(C))) { + assert(!match(Negatible->getOperand(1), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } + if (match(Negatible->getOperand(1), m_APFloat(C))) { + assert(!match(Negatible->getOperand(0), m_Constant()) && + "Expecting only 1 constant operand"); + assert(C->isNegative() && "Expected negative FP constant"); + Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C))); + MadeChange = true; + } } + assert(MadeChange == true && "Negative constant candidate was not changed"); - NI->insertBefore(User); - NI->setName(User->getName()); - User->replaceAllUsesWith(NI); - NI->setDebugLoc(I->getDebugLoc()); + // Negations cancelled out. + if (Candidates.size() % 2 == 0) + return I; + + // Negate the final operand in the expression by flipping the opcode of this + // fadd/fsub. + assert(Candidates.size() % 2 == 1 && "Expected odd number"); + IRBuilder<> Builder(I); + Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I) + : Builder.CreateFSubFMF(OtherOp, Op, I); + I->replaceAllUsesWith(NewInst); RedoInsts.insert(I); - MadeChange = true; - return NI; + return dyn_cast<Instruction>(NewInst); +} + +/// Canonicalize expressions that contain a negative floating-point constant +/// of the following form: +/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree) +/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree) +/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree) +/// +/// The fadd/fsub opcode may be switched to allow folding a negation into the +/// input instruction. 
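+/// For example (illustrative, not code from this patch):
+///   %m = fmul float %y, -4.0
+///   %r = fadd float %x, %m
+/// carries one negative constant in the subtree, so it canonicalizes to
+///   %m = fmul float %y, 4.0
+///   %r = fsub float %x, %m
+/// where flipping fadd to fsub absorbs the leftover negation.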
+Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) { + LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n'); + Value *X; + Instruction *Op; + if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X)))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op))))) + if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X)) + I = R; + return I; } /// Inspect and optimize the given instruction. Note that erasing @@ -2042,16 +2089,16 @@ void ReassociatePass::OptimizeInst(Instruction *I) { I = NI; } - // Canonicalize negative constants out of expressions. - if (Instruction *Res = canonicalizeNegConstExpr(I)) - I = Res; - // Commute binary operators, to canonicalize the order of their operands. // This can potentially expose more CSE opportunities, and makes writing other // transformations simpler. if (I->isCommutative()) canonicalizeOperands(I); + // Canonicalize negative constants out of expressions. + if (Instruction *Res = canonicalizeNegFPConstants(I)) + I = Res; + // Don't optimize floating-point instructions unless they are 'fast'. if (I->getType()->isFPOrFPVectorTy() && !I->isFast()) return; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 3296322e00d5..0716c1320982 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,16 +16,17 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/Statistic.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index c358258d24cf..b242f100faff 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -54,6 +54,7 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -172,8 +173,6 @@ public: bool runOnModule(Module &M) override { bool Changed = false; - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (Function &F : M) { // Nothing to do for declarations. 
if (F.isDeclaration() || F.empty()) @@ -186,6 +185,8 @@ public: TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); Changed |= Impl.runOnFunction(F, DT, TTI, TLI); @@ -2530,7 +2531,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT, // statepoints surviving this pass. This makes testing easier and the // resulting IR less confusing to human readers. DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - bool MadeChange = removeUnreachableBlocks(F, nullptr, &DTU); + bool MadeChange = removeUnreachableBlocks(F, &DTU); // Flush the Dominator Tree. DTU.getDomTree(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp index 4093e50ce899..e696ea83a300 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -29,7 +29,6 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/IR/BasicBlock.h" @@ -49,12 +48,14 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PredicateInfo.h" #include <cassert> #include <utility> @@ -191,7 +192,7 @@ public: /// class SCCPSolver : public InstVisitor<SCCPSolver> { const DataLayout &DL; - const TargetLibraryInfo *TLI; + std::function<const TargetLibraryInfo &(Function &)> GetTLI; SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable. DenseMap<Value *, LatticeVal> ValueState; // The state each value is in. // The state each parameter is in. @@ -268,8 +269,9 @@ public: return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy}; } - SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli) - : DL(DL), TLI(tli) {} + SCCPSolver(const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI) + : DL(DL), GetTLI(std::move(GetTLI)) {} /// MarkBlockExecutable - This method can be used by clients to mark all of /// the blocks that are known to be intrinsically live in the processed unit. @@ -1290,7 +1292,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), F, - Operands, TLI)) { + Operands, &GetTLI(*F))) { // call -> undef. if (isa<UndefValue>(C)) return; @@ -1465,7 +1467,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { } LatticeVal &LV = getValueState(&I); - if (!LV.isUnknown()) continue; + if (!LV.isUnknown()) + continue; + + // There are two reasons a call can have an undef result + // 1. It could be tracked. + // 2. It could be constant-foldable. + // Because of the way we solve return values, tracked calls must + // never be marked overdefined in ResolvedUndefsIn. 
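+      // (Illustrative example, not from the patch: a result such as
+      // %r = call i32 @foo() is left alone here when @foo's return value is
+      // tracked, while a constant-foldable call whose lattice value is still
+      // unknown is conservatively marked overdefined below.)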
+ if (CallSite CS = CallSite(&I)) {
+ if (Function *F = CS.getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(&I);
+ return true;
+ }

// extractvalue is safe; check here because the argument is a struct.
if (isa<ExtractValueInst>(I))
@@ -1638,19 +1657,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
case Instruction::Call:
case Instruction::Invoke:
case Instruction::CallBr:
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (Function *F = CallSite(&I).getCalledFunction())
- if (TrackedRetVals.count(F))
- break;
-
- // If the call is constant-foldable, we mark it overdefined because
- // we do not know what return values are valid.
- markOverdefined(&I);
- return true;
+ llvm_unreachable("Call-like instructions should have been handled early");
default:
// If we don't know what should happen here, conservatively mark it
// overdefined.
@@ -1751,7 +1758,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
[](const LatticeVal &LV) { return LV.isOverdefined(); }))
return false;
std::vector<Constant *> ConstVals;
- auto *ST = dyn_cast<StructType>(V->getType());
+ auto *ST = cast<StructType>(V->getType());
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
LatticeVal V = IVs[i];
ConstVals.push_back(V.isConstant()
@@ -1796,7 +1803,8 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(DL, TLI);
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; });

// Mark the first block of the function as being executable.
Solver.MarkBlockExecutable(&F.front());
@@ -1891,7 +1899,7 @@ public:
return false;
const DataLayout &DL = F.getParent()->getDataLayout();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
return runSCCP(F, DL, TLI);
}
};
@@ -1924,6 +1932,27 @@ static void findReturnsToZap(Function &F,
return;
}

+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+ // uses (like blockaddresses) could stick around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!CallSite(U)) + return true; + if (U->getType()->isStructTy()) { + return all_of( + Solver.getStructLatticeValueFor(U), + [](const LatticeVal &LV) { return !LV.isOverdefined(); }); + } + return !Solver.getLatticeValueFor(U).isOverdefined(); + }) && + "We can only zap functions where all live users have a concrete value"); + for (BasicBlock &BB : F) { if (CallInst *CI = BB.getTerminatingMustTailCall()) { LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present " @@ -1974,9 +2003,10 @@ static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { } bool llvm::runIPSCCP( - Module &M, const DataLayout &DL, const TargetLibraryInfo *TLI, + Module &M, const DataLayout &DL, + std::function<const TargetLibraryInfo &(Function &)> GetTLI, function_ref<AnalysisResultsForFn(Function &)> getAnalysis) { - SCCPSolver Solver(DL, TLI); + SCCPSolver Solver(DL, GetTLI); // Loop over all functions, marking arguments to those with their addresses // taken or that are external as overdefined. @@ -2167,7 +2197,7 @@ bool llvm::runIPSCCP( findReturnsToZap(*F, ReturnsToZap, Solver); } - for (const auto &F : Solver.getMRVFunctionsTracked()) { + for (auto F : Solver.getMRVFunctionsTracked()) { assert(F->getReturnType()->isStructTy() && "The return type should be a struct"); StructType *STy = cast<StructType>(F->getReturnType()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp index bd4c21d65abc..89916e43fce2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SROA.cpp @@ -41,7 +41,6 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/PtrUseVisitor.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -71,6 +70,7 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -80,6 +80,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include <algorithm> #include <cassert> @@ -361,7 +362,7 @@ private: /// The beginning and ending offsets of the alloca for this /// partition. - uint64_t BeginOffset, EndOffset; + uint64_t BeginOffset = 0, EndOffset = 0; /// The start and end iterators of this partition. iterator SI, SJ; @@ -959,14 +960,16 @@ private: std::tie(UsedI, I) = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - Size = std::max(Size, DL.getTypeStoreSize(LI->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(LI->getType()).getFixedSize()); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { Value *Op = SI->getOperand(0); if (Op == UsedI) return SI; - Size = std::max(Size, DL.getTypeStoreSize(Op->getType())); + Size = std::max(Size, + DL.getTypeStoreSize(Op->getType()).getFixedSize()); continue; } @@ -1197,7 +1200,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) { // TODO: Allow recursive phi users. // TODO: Allow stores. 
BasicBlock *BB = PN.getParent(); - unsigned MaxAlign = 0; + MaybeAlign MaxAlign; uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); APInt MaxSize(APWidth, 0); bool HaveLoad = false; @@ -1218,8 +1221,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) { if (BBI->mayWriteToMemory()) return false; - uint64_t Size = DL.getTypeStoreSizeInBits(LI->getType()); - MaxAlign = std::max(MaxAlign, LI->getAlignment()); + uint64_t Size = DL.getTypeStoreSize(LI->getType()); + MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment())); MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize; HaveLoad = true; } @@ -1266,11 +1269,11 @@ static void speculatePHINodeLoads(PHINode &PN) { PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); - // Get the AA tags and alignment to use from one of the loads. It doesn't + // Get the AA tags and alignment to use from one of the loads. It does not // matter which one we get and if any differ. AAMDNodes AATags; SomeLoad->getAAMetadata(AATags); - unsigned Align = SomeLoad->getAlignment(); + const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment()); // Rewrite all loads of the PN to use the new PHI. while (!PN.use_empty()) { @@ -1338,11 +1341,11 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. - if (!isSafeToLoadUnconditionally(TValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(TValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; - if (!isSafeToLoadUnconditionally(FValue, LI->getType(), LI->getAlignment(), - DL, LI)) + if (!isSafeToLoadUnconditionally(FValue, LI->getType(), + MaybeAlign(LI->getAlignment()), DL, LI)) return false; } @@ -1368,8 +1371,8 @@ static void speculateSelectInstLoads(SelectInst &SI) { NumLoadsSpeculated += 2; // Transfer alignment and AA info if present. - TL->setAlignment(LI->getAlignment()); - FL->setAlignment(LI->getAlignment()); + TL->setAlignment(MaybeAlign(LI->getAlignment())); + FL->setAlignment(MaybeAlign(LI->getAlignment())); AAMDNodes Tags; LI->getAAMetadata(Tags); @@ -1678,24 +1681,20 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, } /// Compute the adjusted alignment for a load or store from an offset. -static unsigned getAdjustedAlignment(Instruction *I, uint64_t Offset, - const DataLayout &DL) { - unsigned Alignment; +static Align getAdjustedAlignment(Instruction *I, uint64_t Offset, + const DataLayout &DL) { + MaybeAlign Alignment; Type *Ty; if (auto *LI = dyn_cast<LoadInst>(I)) { - Alignment = LI->getAlignment(); + Alignment = MaybeAlign(LI->getAlignment()); Ty = LI->getType(); } else if (auto *SI = dyn_cast<StoreInst>(I)) { - Alignment = SI->getAlignment(); + Alignment = MaybeAlign(SI->getAlignment()); Ty = SI->getValueOperand()->getType(); } else { llvm_unreachable("Only loads and stores are allowed!"); } - - if (!Alignment) - Alignment = DL.getABITypeAlignment(Ty); - - return MinAlign(Alignment, Offset); + return commonAlignment(DL.getValueOrABITypeAlignment(Alignment, Ty), Offset); } /// Test whether we can convert a value from the old to the new type. @@ -2298,9 +2297,9 @@ class llvm::sroa::AllocaSliceRewriter // The new offsets of the slice currently being rewritten relative to the // original alloca. 
- uint64_t NewBeginOffset, NewEndOffset; + uint64_t NewBeginOffset = 0, NewEndOffset = 0; - uint64_t SliceSize; + uint64_t SliceSize = 0; bool IsSplittable = false; bool IsSplit = false; Use *OldUse = nullptr; @@ -2430,13 +2429,14 @@ private: /// /// You can optionally pass a type to this routine and if that type's ABI /// alignment is itself suitable, this will return zero. - unsigned getSliceAlign(Type *Ty = nullptr) { - unsigned NewAIAlign = NewAI.getAlignment(); - if (!NewAIAlign) - NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType()); - unsigned Align = - MinAlign(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); - return (Ty && Align == DL.getABITypeAlignment(Ty)) ? 0 : Align; + MaybeAlign getSliceAlign(Type *Ty = nullptr) { + const MaybeAlign NewAIAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(NewAI.getAlignment()), NewAI.getAllocatedType()); + const MaybeAlign Align = + commonAlignment(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset); + return (Ty && Align && Align->value() == DL.getABITypeAlignment(Ty)) + ? None + : Align; } unsigned getIndex(uint64_t Offset) { @@ -2798,7 +2798,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); CallInst *New = IRB.CreateMemSet( getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, - getSliceAlign(), II.isVolatile()); + MaybeAlign(getSliceAlign()), II.isVolatile()); if (AATags) New->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); @@ -2884,7 +2884,7 @@ private: assert((IsDest && II.getRawDest() == OldPtr) || (!IsDest && II.getRawSource() == OldPtr)); - unsigned SliceAlign = getSliceAlign(); + MaybeAlign SliceAlign = getSliceAlign(); // For unsplit intrinsics, we simply modify the source and destination // pointers in place. This isn't just an optimization, it is a matter of @@ -2954,10 +2954,10 @@ private: // Compute the relative offset for the other pointer within the transfer. unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); - unsigned OtherAlign = - IsDest ? II.getSourceAlignment() : II.getDestAlignment(); - OtherAlign = MinAlign(OtherAlign ? OtherAlign : 1, - OtherOffset.zextOrTrunc(64).getZExtValue()); + Align OtherAlign = + assumeAligned(IsDest ? 
II.getSourceAlignment() : II.getDestAlignment()); + OtherAlign = + commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); if (EmitMemCpy) { // Compute the other pointer, folding as much as possible to produce @@ -2970,7 +2970,7 @@ private: Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); Value *DestPtr, *SrcPtr; - unsigned DestAlign, SrcAlign; + MaybeAlign DestAlign, SrcAlign; // Note: IsDest is true iff we're copying into the new alloca slice if (IsDest) { DestPtr = OurPtr; @@ -3017,9 +3017,9 @@ private: Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, OtherPtr->getName() + "."); - unsigned SrcAlign = OtherAlign; + MaybeAlign SrcAlign = OtherAlign; Value *DstPtr = &NewAI; - unsigned DstAlign = SliceAlign; + MaybeAlign DstAlign = SliceAlign; if (!IsDest) { std::swap(SrcPtr, DstPtr); std::swap(SrcAlign, DstAlign); @@ -3115,20 +3115,17 @@ private: Instruction *I = Uses.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - unsigned LoadAlign = LI->getAlignment(); - if (!LoadAlign) - LoadAlign = DL.getABITypeAlignment(LI->getType()); + MaybeAlign LoadAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(LI->getAlignment()), LI->getType()); LI->setAlignment(std::min(LoadAlign, getSliceAlign())); continue; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - unsigned StoreAlign = SI->getAlignment(); - if (!StoreAlign) { Value *Op = SI->getOperand(0); - StoreAlign = DL.getABITypeAlignment(Op->getType()); - } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); - continue; + MaybeAlign StoreAlign = DL.getValueOrABITypeAlignment( + MaybeAlign(SI->getAlignment()), Op->getType()); + SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + continue; } assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || @@ -3220,7 +3217,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { /// The current pointer use being rewritten. This is used to dig up the used /// value (as opposed to the user). - Use *U; + Use *U = nullptr; /// Used to calculate offsets, and hence alignment, of subobjects. const DataLayout &DL; @@ -3275,7 +3272,7 @@ private: Type *BaseTy; /// Known alignment of the base pointer. - unsigned BaseAlign; + Align BaseAlign; /// To calculate offset of each component so we can correctly deduce /// alignments. @@ -3284,7 +3281,7 @@ private: /// Initialize the splitter with an insertion point, Ptr and start with a /// single zero GEP index. OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - unsigned BaseAlign, const DataLayout &DL) + Align BaseAlign, const DataLayout &DL) : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {} @@ -3306,7 +3303,7 @@ private: if (Ty->isSingleValueType()) { unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); return static_cast<Derived *>(this)->emitFunc( - Ty, Agg, MinAlign(BaseAlign, Offset), Name); + Ty, Agg, commonAlignment(BaseAlign, Offset), Name); } if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { @@ -3347,18 +3344,20 @@ private: AAMDNodes AATags; LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, - DL), AATags(AATags) {} + DL), + AATags(AATags) {} /// Emit a leaf load of a single value. 
This is called at the leaves of the /// recursive emission to actually load values. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Load the single value and insert it using the indices. Value *GEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); - LoadInst *Load = IRB.CreateAlignedLoad(Ty, GEP, Align, Name + ".load"); + LoadInst *Load = + IRB.CreateAlignedLoad(Ty, GEP, Alignment.value(), Name + ".load"); if (AATags) Load->setAAMetadata(AATags); Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); @@ -3386,14 +3385,14 @@ private: struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, - AAMDNodes AATags, unsigned BaseAlign, const DataLayout &DL) + AAMDNodes AATags, Align BaseAlign, const DataLayout &DL) : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL), AATags(AATags) {} AAMDNodes AATags; /// Emit a leaf store of a single value. This is called at the leaves of the /// recursive emission to actually produce stores. - void emitFunc(Type *Ty, Value *&Agg, unsigned Align, const Twine &Name) { + void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { assert(Ty->isSingleValueType()); // Extract the single value and store it using the indices. // @@ -3404,7 +3403,7 @@ private: Value *InBoundsGEP = IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); StoreInst *Store = - IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Align); + IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment.value()); if (AATags) Store->setAAMetadata(AATags); LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); @@ -3861,8 +3860,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, BasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); @@ -3919,7 +3918,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), PartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); @@ -4003,8 +4003,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, LoadBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), LoadPartPtrTy, LoadBasePtr->getName() + "."), - getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, - LI->getName()); + getAdjustedAlignment(LI, PartOffset, DL).value(), + /*IsVolatile*/ false, LI->getName()); } // And store this partition. 
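As an aside, the SROA hunks above and below all follow the same pattern when migrating from raw unsigned alignments to the Align/MaybeAlign types: an access with no explicit alignment falls back to the ABI alignment of its type, and an access split off at a byte offset may only keep the alignment that offset still guarantees. A minimal sketch of that pattern, using only the helpers that appear in these hunks (the function name splitAccessAlignment is illustrative and is not part of the diff):

// Sketch only, not part of the committed diff: derive the alignment of an
// access that is split off `Offset` bytes into an original load or store,
// in the Align/MaybeAlign style used by the surrounding hunks.
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Alignment.h"

static llvm::Align splitAccessAlignment(llvm::MaybeAlign ExplicitAlign,
                                        llvm::Type *AccessTy, uint64_t Offset,
                                        const llvm::DataLayout &DL) {
  // A missing (None) alignment means "whatever the ABI guarantees for the
  // accessed type".
  llvm::Align Base = DL.getValueOrABITypeAlignment(ExplicitAlign, AccessTy);
  // The split access can only rely on the common alignment of the base
  // alignment and its byte offset (e.g. align 16 at offset 4 yields align 4).
  return llvm::commonAlignment(Base, Offset);
}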
@@ -4015,7 +4015,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { getAdjustedPtr(IRB, DL, StoreBasePtr, APInt(DL.getIndexSizeInBits(AS), PartOffset), StorePartPtrTy, StoreBasePtr->getName() + "."), - getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); + getAdjustedAlignment(SI, PartOffset, DL).value(), + /*IsVolatile*/ false); // Now build a new slice for the alloca. NewSlices.push_back( @@ -4150,20 +4151,19 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // FIXME: We might want to defer PHI speculation until after here. // FIXME: return nullptr; } else { - unsigned Alignment = AI.getAlignment(); - if (!Alignment) { - // The minimum alignment which users can rely on when the explicit - // alignment is omitted or zero is that required by the ABI for this - // type. - Alignment = DL.getABITypeAlignment(AI.getAllocatedType()); - } - Alignment = MinAlign(Alignment, P.beginOffset()); + // If alignment is unspecified we fallback on the one required by the ABI + // for this type. We also make sure the alignment is compatible with + // P.beginOffset(). + const Align Alignment = commonAlignment( + DL.getValueOrABITypeAlignment(MaybeAlign(AI.getAlignment()), + AI.getAllocatedType()), + P.beginOffset()); // If we will get at least this much alignment from the type alone, leave // the alloca's alignment unconstrained. - if (Alignment <= DL.getABITypeAlignment(SliceTy)) - Alignment = 0; + const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy); NewAI = new AllocaInst( - SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, + SliceTy, AI.getType()->getAddressSpace(), nullptr, + IsUnconstrained ? MaybeAlign() : Alignment, AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); // Copy the old AI debug location over to the new one. 
NewAI->setDebugLoc(AI.getDebugLoc()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp index 869cf00e0a89..9d088547b436 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -79,8 +79,10 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopVersioningLICMPass(Registry); initializeLoopIdiomRecognizeLegacyPassPass(Registry); initializeLowerAtomicLegacyPassPass(Registry); + initializeLowerConstantIntrinsicsPass(Registry); initializeLowerExpectIntrinsicPass(Registry); initializeLowerGuardIntrinsicLegacyPassPass(Registry); + initializeLowerMatrixIntrinsicsLegacyPassPass(Registry); initializeLowerWidenableConditionLegacyPassPass(Registry); initializeMemCpyOptLegacyPassPass(Registry); initializeMergeICmpsLegacyPassPass(Registry); @@ -88,6 +90,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeNaryReassociateLegacyPassPass(Registry); initializePartiallyInlineLibCallsLegacyPassPass(Registry); initializeReassociateLegacyPassPass(Registry); + initializeRedundantDbgInstEliminationPass(Registry); initializeRegToMemPass(Registry); initializeRewriteStatepointsForGCLegacyPassPass(Registry); initializeSCCPLegacyPassPass(Registry); @@ -123,6 +126,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } +void LLVMAddDCEPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createDeadCodeEliminationPass()); +} + void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBitTrackingDCEPass()); } @@ -280,6 +287,10 @@ void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createBasicAAWrapperPass()); } +void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerConstantIntrinsicsPass()); +} + void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLowerExpectIntrinsicPass()); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 2ee1a3a95f2a..c25c6c632b8f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" @@ -21,6 +22,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -33,12 +35,12 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Support/Options.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Scalar/Scalarizer.h" #include <cassert> #include <cstdint> #include <iterator> @@ -173,8 +175,8 @@ struct VectorLayout { class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> { public: - ScalarizerVisitor(unsigned ParallelLoopAccessMDKind) - : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind) { + 
ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT) + : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) { } bool visit(Function &F); @@ -214,6 +216,8 @@ private: GatherList Gathered; unsigned ParallelLoopAccessMDKind; + + DominatorTree *DT; }; class ScalarizerLegacyPass : public FunctionPass { @@ -225,6 +229,11 @@ public: } bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage& AU) const override { + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + } }; } // end anonymous namespace @@ -232,6 +241,7 @@ public: char ScalarizerLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) @@ -303,7 +313,8 @@ bool ScalarizerLegacyPass::runOnFunction(Function &F) { Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); return Impl.visit(F); } @@ -340,6 +351,15 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) { return Scatterer(BB, BB->begin(), V, &Scattered[V]); } if (Instruction *VOp = dyn_cast<Instruction>(V)) { + // When scalarizing PHI nodes we might try to examine/rewrite InsertElement + // nodes in predecessors. If those predecessors are unreachable from entry, + // then the IR in those blocks could have unexpected properties resulting in + // infinite loops in Scatterer::operator[]. By simply treating values + // originating from instructions in unreachable blocks as undef we do not + // need to analyse them further. + if (!DT->isReachableFromEntry(VOp->getParent())) + return Scatterer(Point->getParent(), Point->getIterator(), + UndefValue::get(V->getType())); // Put the scattered form of an instruction directly after the // instruction. BasicBlock *BB = VOp->getParent(); @@ -856,7 +876,10 @@ PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) Module &M = *F.getParent(); unsigned ParallelLoopAccessMDKind = M.getContext().getMDKindID("llvm.mem.parallel_loop_access"); - ScalarizerVisitor Impl(ParallelLoopAccessMDKind); + DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); + ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT); bool Changed = Impl.visit(F); - return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + return Changed ? 
PA : PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index f6a12fb13142..2a1a040bf83e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -164,7 +164,6 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" @@ -182,6 +181,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -189,6 +189,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <string> @@ -1121,7 +1122,7 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); bool Changed = false; for (BasicBlock &B : F) { for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index aeac6f548b32..d7a34acb4318 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -38,8 +38,10 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTree.h" @@ -263,7 +265,7 @@ static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, /// to an entirely separate nest. static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, DominatorTree &DT, LoopInfo &LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { // If the loop is already at the top level, we can't hoist it anywhere. Loop *OldParentL = L.getParentLoop(); if (!OldParentL) @@ -317,7 +319,7 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, // Because we just hoisted a loop out of this one, we have essentially // created new exit paths from it. That means we need to form LCSSA PHI // nodes for values used in the no-longer-nested loop. - formLCSSA(*OldContainingL, DT, &LI, nullptr); + formLCSSA(*OldContainingL, DT, &LI, SE); // We shouldn't need to form dedicated exits because the exit introduced // here is the (just split by unswitching) preheader. 
However, after trivial @@ -329,6 +331,20 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, } } +// Return the top-most loop containing ExitBB and having ExitBB as exiting block +// or the loop containing ExitBB, if there is no parent loop containing ExitBB +// as exiting block. +static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) { + Loop *TopMost = LI.getLoopFor(ExitBB); + Loop *Current = TopMost; + while (Current) { + if (Current->isLoopExiting(ExitBB)) + TopMost = Current; + Current = Current->getParentLoop(); + } + return TopMost; +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -413,9 +429,10 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, }); // If we have scalar evolutions, we need to invalidate them including this - // loop and the loop containing the exit block. + // loop, the loop containing the exit block and the topmost parent loop + // exiting via LoopExitBB. if (SE) { - if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) SE->forgetLoop(ExitL); else // Forget the entire nest as this exits the entire nest. @@ -532,7 +549,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // If this was full unswitching, we may have changed the nesting relationship // for this loop so hoist it to its correct parent if needed. if (FullUnswitch) - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -825,7 +842,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // We may have changed the nesting relationship for this loop so hoist it to // its correct parent if needed. - hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU); + hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); @@ -1909,7 +1926,7 @@ static void unswitchNontrivialInvariants( // We can only unswitch switches, conditional branches with an invariant // condition, or combining invariant conditions with an instruction. - assert((SI || BI->isConditional()) && + assert((SI || (BI && BI->isConditional())) && "Can only unswitch switches and conditional branch!"); bool FullUnswitch = SI || BI->getCondition() == Invariants[0]; if (FullUnswitch) @@ -2141,17 +2158,21 @@ static void unswitchNontrivialInvariants( buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction, *ClonedPH, *LoopPH); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); + + if (MSSAU) { + DT.applyUpdates(DTUpdates); + DTUpdates.clear(); + + // Perform MSSA cloning updates. + for (auto &VMap : VMaps) + MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap, + /*IgnoreIncomingWithNoClones=*/true); + MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); + } } // Apply the updates accumulated above to get an up-to-date dominator tree. DT.applyUpdates(DTUpdates); - if (!FullUnswitch && MSSAU) { - // Update MSSA for partial unswitch, after DT update. - SmallVector<CFGUpdate, 1> Updates; - Updates.push_back( - {cfg::UpdateKind::Insert, SplitBB, ClonedPHs.begin()->second}); - MSSAU->applyInsertUpdates(Updates, DT); - } // Now that we have an accurate dominator tree, first delete the dead cloned // blocks so that we can accurately build any cloned loops. 
It is important to @@ -2256,7 +2277,7 @@ static void unswitchNontrivialInvariants( // First build LCSSA for this loop so that we can preserve it when // forming dedicated exits. We don't want to perturb some other loop's // LCSSA while doing that CFG edit. - formLCSSA(UpdateL, DT, &LI, nullptr); + formLCSSA(UpdateL, DT, &LI, SE); // For loops reached by this loop's original exit blocks we may // introduced new, non-dedicated exits. At least try to re-form dedicated @@ -2422,7 +2443,7 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, if (MSSAU) { MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI)); - MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::End); + MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator); if (VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); } @@ -2720,7 +2741,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, return Cost * (SuccessorsCount - 1); }; Instruction *BestUnswitchTI = nullptr; - int BestUnswitchCost; + int BestUnswitchCost = 0; ArrayRef<Value *> BestUnswitchInvariants; for (auto &TerminatorAndInvariants : UnswitchCandidates) { Instruction &TI = *TerminatorAndInvariants.first; @@ -2752,6 +2773,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, BestUnswitchInvariants = Invariants; } } + assert(BestUnswitchTI && "Failed to find loop unswitch candidate"); if (BestUnswitchCost >= UnswitchThreshold) { LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " @@ -2880,7 +2902,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); auto PA = getLoopPassPreservedAnalyses(); - if (EnableMSSALoopDependency) + if (AR.MSSA) PA.preserve<MemorySSAAnalysis>(); return PA; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 4544975a4887..623a8b711ed8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -27,7 +27,6 @@ #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" @@ -35,10 +34,12 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Utils/Local.h" #include <utility> using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp index 90f3a2aa46e1..677d86f8c7b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/Sink.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" @@ -78,7 +79,7 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA, if (auto *Call = dyn_cast<CallBase>(Inst)) { // Convergent operations cannot be made control-dependent on additional // values. 
- if (Call->hasFnAttr(Attribute::Convergent)) + if (Call->isConvergent()) return false; for (Instruction *S : Stores) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp index e6db11f47ead..cd7bfb2f20dc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp @@ -283,12 +283,12 @@ static bool isSafeAndProfitableToSpeculateAroundPHI( int MatCost = IncomingConstantAndCostsAndCount.second.MatCost; int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost; if (IID) - FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(), - IncomingC->getType()); + FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(), + IncomingC->getType()); else FoldedCost += - TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(), - IncomingC->getType()); + TTI.getIntImmCostInst(UserI->getOpcode(), Idx, + IncomingC->getValue(), IncomingC->getType()); // If we accumulate more folded cost for this incoming constant than // materialized cost, then we'll regress any edge with this constant so diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp index f9d027eb4a3b..c8d899bb4871 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index a58c32cc5894..9f82b1263ebd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -60,7 +60,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -76,10 +75,12 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstdint> #include <limits> diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index e5400676c7e8..4ce4ce46f67a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -34,8 +34,10 @@ #include "llvm/IR/Use.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -65,7 +67,7 @@ static 
cl::opt<bool> ForceSkipUniformRegions( static cl::opt<bool> RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden, cl::desc("Allow relaxed uniform region checks"), - cl::init(false)); + cl::init(true)); // Definition of the complex types used in this pass. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index f0b79079d817..9f0ab9103d42 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -76,6 +76,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -341,7 +342,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { const DataLayout &DL = L->getModule()->getDataLayout(); if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(), - L->getAlignment(), DL, L)) + MaybeAlign(L->getAlignment()), DL, L)) return false; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp index 707adf46d1f4..c8461fdc1608 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp @@ -12,6 +12,7 @@ #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index ee0973002c47..0908b361a4d4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -63,6 +63,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -232,7 +233,7 @@ static bool addDiscriminators(Function &F) { LocationSet CallLocations; for (auto &I : B.getInstList()) { // We bypass intrinsic calls for the following two reasons: - // 1) We want to avoid a non-deterministic assigment of + // 1) We want to avoid a non-deterministic assignment of // discriminators. // 2) We want to minimize the number of base discriminators used. 
if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I))) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 5fa371377c85..c9eb4abfa21a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -170,7 +170,8 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) { bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, LoopInfo *LI, MemorySSAUpdater *MSSAU, - MemoryDependenceResults *MemDep) { + MemoryDependenceResults *MemDep, + bool PredecessorWithTwoSuccessors) { if (BB->hasAddressTaken()) return false; @@ -185,9 +186,24 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, return false; // Can't merge if there are multiple distinct successors. - if (PredBB->getUniqueSuccessor() != BB) + if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB) return false; + // Currently only allow PredBB to have two predecessors, one being BB. + // Update BI to branch to BB's only successor instead of BB. + BranchInst *PredBB_BI; + BasicBlock *NewSucc = nullptr; + unsigned FallThruPath; + if (PredecessorWithTwoSuccessors) { + if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator()))) + return false; + BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BB_JmpI || !BB_JmpI->isUnconditional()) + return false; + NewSucc = BB_JmpI->getSuccessor(0); + FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1; + } + // Can't merge if there is PHI loop. for (PHINode &PN : BB->phis()) for (Value *IncValue : PN.incoming_values()) @@ -227,34 +243,51 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, Updates.push_back({DominatorTree::Delete, PredBB, BB}); } - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(BB, PredBB, &*(BB->begin())); + Instruction *PTI = PredBB->getTerminator(); + Instruction *STI = BB->getTerminator(); + Instruction *Start = &*BB->begin(); + // If there's nothing to move, mark the starting instruction as the last + // instruction in the block. Terminator instruction is handled separately. + if (Start == STI) + Start = PTI; - // Delete the unconditional branch from the predecessor... - PredBB->getInstList().pop_back(); + // Move all definitions in the successor to the predecessor... + PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(), + BB->begin(), STI->getIterator()); + + if (MSSAU) + MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start); // Make all PHI nodes that referred to BB now refer to Pred as their // source... BB->replaceAllUsesWith(PredBB); - // Move all definitions in the successor to the predecessor... - PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); - new UnreachableInst(BB->getContext(), BB); + if (PredecessorWithTwoSuccessors) { + // Delete the unconditional branch from BB. + BB->getInstList().pop_back(); - // Eliminate duplicate dbg.values describing the entry PHI node post-splice. 
- for (auto Incoming : IncomingValues) { - if (isa<Instruction>(*Incoming)) { - SmallVector<DbgValueInst *, 2> DbgValues; - SmallDenseSet<std::pair<DILocalVariable *, DIExpression *>, 2> - DbgValueSet; - llvm::findDbgValues(DbgValues, Incoming); - for (auto &DVI : DbgValues) { - auto R = DbgValueSet.insert({DVI->getVariable(), DVI->getExpression()}); - if (!R.second) - DVI->eraseFromParent(); - } - } + // Update branch in the predecessor. + PredBB_BI->setSuccessor(FallThruPath, NewSucc); + } else { + // Delete the unconditional branch from the predecessor. + PredBB->getInstList().pop_back(); + + // Move terminator instruction. + PredBB->getInstList().splice(PredBB->end(), BB->getInstList()); + + // Terminator may be a memory accessing instruction too. + if (MSSAU) + if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>( + MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator()))) + MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End); } + // Add unreachable to now empty BB. + new UnreachableInst(BB->getContext(), BB); + + // Eliminate duplicate/redundant dbg.values. This seems to be a good place to + // do that since we might end up with redundant dbg.values describing the + // entry PHI node post-splice. + RemoveRedundantDbgInstrs(PredBB); // Inherit predecessors name if it exists. if (!PredBB->hasName()) @@ -274,14 +307,131 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, "applying corresponding DTU updates."); DTU->applyUpdatesPermissive(Updates); DTU->deleteBB(BB); - } - - else { + } else { BB->eraseFromParent(); // Nuke BB if DTU is nullptr. } + return true; } +/// Remove redundant instructions within sequences of consecutive dbg.value +/// instructions. This is done using a backward scan to keep the last dbg.value +/// describing a specific variable/fragment. +/// +/// BackwardScan strategy: +/// ---------------------- +/// Given a sequence of consecutive DbgValueInst like this +/// +/// dbg.value ..., "x", FragmentX1 (*) +/// dbg.value ..., "y", FragmentY1 +/// dbg.value ..., "x", FragmentX2 +/// dbg.value ..., "x", FragmentX1 (**) +/// +/// then the instruction marked with (*) can be removed (it is guaranteed to be +/// obsoleted by the instruction marked with (**) as the latter instruction is +/// describing the same variable using the same fragment info). +/// +/// Possible improvements: +/// - Check fully overlapping fragments and not only identical fragments. +/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta +/// instructions being part of the sequence of consecutive instructions. +static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + SmallDenseSet<DebugVariable> VariableSet; + for (auto &I : reverse(*BB)) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + DVI->getExpression(), + DVI->getDebugLoc()->getInlinedAt()); + auto R = VariableSet.insert(Key); + // If the same variable fragment is described more than once it is enough + // to keep the last one (i.e. the first found since we iterate in + // reverse). + if (!R.second) + ToBeRemoved.push_back(DVI); + continue; + } + // Sequence with consecutive dbg.value instrs ended. Clear the set to + // restart identifying redundant instructions in case we find another + // dbg.value sequence.
+ VariableSet.clear(); + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +/// Remove redundant dbg.value instructions using a forward scan. This can +/// remove a dbg.value instruction that is redundant due to indicating that a +/// variable has the same value as already being indicated by an earlier +/// dbg.value. +/// +/// ForwardScan strategy: +/// --------------------- +/// Given two identical dbg.value instructions, separated by a block of +/// instructions that isn't describing the same variable, like this +/// +/// dbg.value X1, "x", FragmentX1 (**) +/// <block of instructions, none being "dbg.value ..., "x", ..."> +/// dbg.value X1, "x", FragmentX1 (*) +/// +/// then the instruction marked with (*) can be removed. Variable "x" is already +/// described as being mapped to the SSA value X1. +/// +/// Possible improvements: +/// - Keep track of non-overlapping fragments. +static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { + SmallVector<DbgValueInst *, 8> ToBeRemoved; + DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap; + for (auto &I : *BB) { + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) { + DebugVariable Key(DVI->getVariable(), + NoneType(), + DVI->getDebugLoc()->getInlinedAt()); + auto VMI = VariableMap.find(Key); + // Update the map if we found a new value/expression describing the + // variable, or if the variable wasn't mapped already. + if (VMI == VariableMap.end() || + VMI->second.first != DVI->getValue() || + VMI->second.second != DVI->getExpression()) { + VariableMap[Key] = { DVI->getValue(), DVI->getExpression() }; + continue; + } + // Found an identical mapping. Remember the instruction for later removal. + ToBeRemoved.push_back(DVI); + } + } + + for (auto &Instr : ToBeRemoved) + Instr->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + +bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) { + bool MadeChanges = false; + // By using the "backward scan" strategy before the "forward scan" strategy we + // can remove both dbg.value (2) and (3) in a situation like this: + // + // (1) dbg.value V1, "x", DIExpression() + // ... + // (2) dbg.value V2, "x", DIExpression() + // (3) dbg.value V1, "x", DIExpression() + // + // The backward scan will remove (2), as it is made obsolete by (3). After + // getting (2) out of the way, the forward scan will remove (3) since "x" + // already is described as having the value V1 at (1). + MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB); + MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB); + + if (MadeChanges) + LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: " + << BB->getName() << "\n"); + return MadeChanges; +} + void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL, BasicBlock::iterator &BI, Value *V) { Instruction &I = *BI; @@ -365,11 +515,13 @@ llvm::SplitAllCriticalEdges(Function &F, BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, DominatorTree *DT, LoopInfo *LI, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const Twine &BBName) { BasicBlock::iterator SplitIt = SplitPt->getIterator(); while (isa<PHINode>(SplitIt) || SplitIt->isEHPad()) ++SplitIt; - BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split"); + std::string Name = BBName.str(); + BasicBlock *New = Old->splitBasicBlock( + SplitIt, Name.empty() ? Old->getName() + ".split" : Name); // The new block lives in whichever loop the old one did.
This preserves // LCSSA as well, because we force the split point to be after any PHI nodes. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index f5e4b53f6d97..008cea333e6b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp index 27f110e24f9c..71316ce8f758 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BuildLibCalls.cpp @@ -88,6 +88,14 @@ static bool setDoesNotCapture(Function &F, unsigned ArgNo) { return true; } +static bool setDoesNotAlias(Function &F, unsigned ArgNo) { + if (F.hasParamAttribute(ArgNo, Attribute::NoAlias)) + return false; + F.addParamAttr(ArgNo, Attribute::NoAlias); + ++NumNoAlias; + return true; +} + static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) { if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly)) return false; @@ -175,6 +183,9 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { return Changed; case LibFunc_strcpy: case LibFunc_strncpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + LLVM_FALLTHROUGH; case LibFunc_strcat: case LibFunc_strncat: Changed |= setReturnedArg(F, 0); @@ -249,12 +260,14 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { case LibFunc_sprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); return Changed; case LibFunc_snprintf: Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 0); + Changed |= setDoesNotAlias(F, 0); Changed |= setDoesNotCapture(F, 2); Changed |= setOnlyReadsMemory(F, 2); return Changed; @@ -291,11 +304,23 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { Changed |= setDoesNotCapture(F, 1); return Changed; case LibFunc_memcpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); + Changed |= setReturnedArg(F, 0); + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_memmove: Changed |= setReturnedArg(F, 0); - LLVM_FALLTHROUGH; + Changed |= setDoesNotThrow(F); + Changed |= setDoesNotCapture(F, 1); + Changed |= setOnlyReadsMemory(F, 1); + return Changed; case LibFunc_mempcpy: case LibFunc_memccpy: + Changed |= setDoesNotAlias(F, 0); + Changed |= setDoesNotAlias(F, 1); Changed |= setDoesNotThrow(F); Changed |= setDoesNotCapture(F, 1); Changed |= setOnlyReadsMemory(F, 1); @@ -760,9 +785,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) { } } -bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { +bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) { switch (Ty->getTypeID()) { case 
Type::HalfTyID: return false; @@ -775,10 +799,10 @@ bool llvm::hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, } } -StringRef llvm::getUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty, - LibFunc DoubleFn, LibFunc FloatFn, - LibFunc LongDoubleFn) { - assert(hasUnaryFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && +StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn) { + assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) && "Cannot get name for unavailable function!"); switch (Ty->getTypeID()) { @@ -827,6 +851,12 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL, B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI); } +Value *llvm::emitStrDup(Value *Ptr, IRBuilder<> &B, + const TargetLibraryInfo *TLI) { + return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(), + castToCStr(Ptr, B), B, TLI); +} + Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetLibraryInfo *TLI) { Type *I8Ptr = B.getInt8PtrTy(); @@ -1045,24 +1075,28 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI, LibFunc LongDoubleFn, IRBuilder<> &B, const AttributeList &Attrs) { // Get the name of the function according to TLI. - StringRef Name = getUnaryFloatFn(TLI, Op->getType(), - DoubleFn, FloatFn, LongDoubleFn); + StringRef Name = getFloatFnName(TLI, Op->getType(), + DoubleFn, FloatFn, LongDoubleFn); return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs); } -Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, - IRBuilder<> &B, const AttributeList &Attrs) { +static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2, + StringRef Name, IRBuilder<> &B, + const AttributeList &Attrs) { assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); - SmallString<20> NameBuffer; - appendTypeSuffix(Op1, Name, NameBuffer); - Module *M = B.GetInsertBlock()->getModule(); - FunctionCallee Callee = M->getOrInsertFunction( - Name, Op1->getType(), Op1->getType(), Op2->getType()); - CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name); - CI->setAttributes(Attrs); + FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(), + Op1->getType(), Op2->getType()); + CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name); + + // The incoming attribute set may have come from a speculatable intrinsic, but + // is being replaced with a library call which is not allowed to be + // speculatable. + CI->setAttributes(Attrs.removeAttribute(B.getContext(), + AttributeList::FunctionIndex, + Attribute::Speculatable)); if (const Function *F = dyn_cast<Function>(Callee.getCallee()->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -1070,6 +1104,28 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, return CI; } +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name, + IRBuilder<> &B, const AttributeList &Attrs) { + assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall"); + + SmallString<20> NameBuffer; + appendTypeSuffix(Op1, Name, NameBuffer); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + +Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, + const TargetLibraryInfo *TLI, + LibFunc DoubleFn, LibFunc FloatFn, + LibFunc LongDoubleFn, IRBuilder<> &B, + const AttributeList &Attrs) { + // Get the name of the function according to TLI. 
+ StringRef Name = getFloatFnName(TLI, Op1->getType(), + DoubleFn, FloatFn, LongDoubleFn); + + return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs); +} + Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI) { if (!TLI->has(LibFunc_putchar)) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index df299f673f65..9a6761040bd8 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -448,13 +448,17 @@ bool llvm::bypassSlowDivision(BasicBlock *BB, DivCacheTy PerBBDivCache; bool MadeChange = false; - Instruction* Next = &*BB->begin(); + Instruction *Next = &*BB->begin(); while (Next != nullptr) { // We may add instructions immediately after I, but we want to skip over // them. - Instruction* I = Next; + Instruction *I = Next; Next = Next->getNextNode(); + // Ignore dead code to save time and avoid bugs. + if (I->hasNUses(0)) + continue; + FastDivInsertionTask Task(I, BypassWidths); if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { I->replaceAllUsesWith(Replacement); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp index 455fcbb1cf98..6b01c0c71d00 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp @@ -30,9 +30,10 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/CanonicalizeAliases.h" - #include "llvm/IR/Operator.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp index 1026c9d37038..75e8963303c2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -210,6 +210,21 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, RemapInstruction(&II, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, TypeMapper, Materializer); + + // Register all DICompileUnits of the old parent module in the new parent module + auto* OldModule = OldFunc->getParent(); + auto* NewModule = NewFunc->getParent(); + if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) { + auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu"); + // Avoid multiple insertions of the same DICompileUnit to NMD. 
+ SmallPtrSet<const void*, 8> Visited; + for (auto* Operand : NMD->operands()) + Visited.insert(Operand); + for (auto* Unit : DIFinder.compile_units()) + // VMap.MD()[Unit] == Unit + if (Visited.insert(Unit).second) + NMD->addOperand(Unit); + } } /// Return a copy of the specified function and add it to that function's diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp index 7ddf59becba9..2c8c3abb2922 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -48,7 +48,7 @@ std::unique_ptr<Module> llvm::CloneModule( function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) { // First off, we need to create the new module. std::unique_ptr<Module> New = - llvm::make_unique<Module>(M.getModuleIdentifier(), M.getContext()); + std::make_unique<Module>(M.getModuleIdentifier(), M.getContext()); New->setSourceFileName(M.getSourceFileName()); New->setDataLayout(M.getDataLayout()); New->setTargetTriple(M.getTargetTriple()); @@ -181,13 +181,25 @@ std::unique_ptr<Module> llvm::CloneModule( } // And named metadata.... + const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu"); for (Module::const_named_metadata_iterator I = M.named_metadata_begin(), E = M.named_metadata_end(); I != E; ++I) { const NamedMDNode &NMD = *I; NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName()); - for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) - NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); + if (&NMD == LLVM_DBG_CU) { + // Do not insert duplicate operands. + SmallPtrSet<const void*, 8> Visited; + for (const auto* Operand : NewNMD->operands()) + Visited.insert(Operand); + for (const auto* Operand : NMD.operands()) { + auto* MappedOperand = MapMetadata(Operand, VMap); + if (Visited.insert(MappedOperand).second) + NewNMD->addOperand(MappedOperand); + } + } else + for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i) + NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap)); } return New; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp index fa6d3f8ae873..682af4a88d3e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -293,10 +293,8 @@ static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { CommonExitBlock = Succ; continue; } - if (CommonExitBlock == Succ) - continue; - - return true; + if (CommonExitBlock != Succ) + return true; } return false; }; @@ -307,52 +305,79 @@ static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) { return CommonExitBlock; } -bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( - Instruction *Addr) const { - AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); - Function *Func = (*Blocks.begin())->getParent(); - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) - continue; - for (Instruction &II : BB) { - if (isa<DbgInfoIntrinsic>(II)) - continue; +CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) { + for (BasicBlock &BB : F) { + for (Instruction &II : BB.instructionsWithoutDebug()) + if (auto *AI = dyn_cast<AllocaInst>(&II)) + Allocas.push_back(AI); - unsigned Opcode = II.getOpcode(); - Value *MemAddr = nullptr; - switch (Opcode) { - case Instruction::Store: - case Instruction::Load: { - 
if (Opcode == Instruction::Store) { - StoreInst *SI = cast<StoreInst>(&II); - MemAddr = SI->getPointerOperand(); - } else { - LoadInst *LI = cast<LoadInst>(&II); - MemAddr = LI->getPointerOperand(); - } - // Global variable can not be aliased with locals. - if (dyn_cast<Constant>(MemAddr)) - break; - Value *Base = MemAddr->stripInBoundsConstantOffsets(); - if (!isa<AllocaInst>(Base) || Base == AI) - return false; + findSideEffectInfoForBlock(BB); + } +} + +void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) { + for (Instruction &II : BB.instructionsWithoutDebug()) { + unsigned Opcode = II.getOpcode(); + Value *MemAddr = nullptr; + switch (Opcode) { + case Instruction::Store: + case Instruction::Load: { + if (Opcode == Instruction::Store) { + StoreInst *SI = cast<StoreInst>(&II); + MemAddr = SI->getPointerOperand(); + } else { + LoadInst *LI = cast<LoadInst>(&II); + MemAddr = LI->getPointerOperand(); + } + // Global variable can not be aliased with locals. + if (dyn_cast<Constant>(MemAddr)) break; + Value *Base = MemAddr->stripInBoundsConstantOffsets(); + if (!isa<AllocaInst>(Base)) { + SideEffectingBlocks.insert(&BB); + return; } - default: { - IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); - if (IntrInst) { - if (IntrInst->isLifetimeStartOrEnd()) - break; - return false; - } - // Treat all the other cases conservatively if it has side effects. - if (II.mayHaveSideEffects()) - return false; + BaseMemAddrs[&BB].insert(Base); + break; + } + default: { + IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II); + if (IntrInst) { + if (IntrInst->isLifetimeStartOrEnd()) + break; + SideEffectingBlocks.insert(&BB); + return; } + // Treat all the other cases conservatively if it has side effects. + if (II.mayHaveSideEffects()) { + SideEffectingBlocks.insert(&BB); + return; } } + } } +} +bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr( + BasicBlock &BB, AllocaInst *Addr) const { + if (SideEffectingBlocks.count(&BB)) + return true; + auto It = BaseMemAddrs.find(&BB); + if (It != BaseMemAddrs.end()) + return It->second.count(Addr); + return false; +} + +bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers( + const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const { + AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets()); + Function *Func = (*Blocks.begin())->getParent(); + for (BasicBlock &BB : *Func) { + if (Blocks.count(&BB)) + continue; + if (CEAC.doesBlockContainClobberOfAddr(BB, AI)) + return false; + } return true; } @@ -415,7 +440,8 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { // outline region. If there are not other untracked uses of the address, return // the pair of markers if found; otherwise return a pair of nullptr. CodeExtractor::LifetimeMarkerInfo -CodeExtractor::getLifetimeMarkers(Instruction *Addr, +CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC, + Instruction *Addr, BasicBlock *ExitBlock) const { LifetimeMarkerInfo Info; @@ -447,7 +473,7 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd); // Do legality check. if ((Info.SinkLifeStart || Info.HoistLifeEnd) && - !isLegalToShrinkwrapLifetimeMarkers(Addr)) + !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr)) return {}; // Check to see if we have a place to do hoisting, if not, bail. 
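The net effect of this refactor is that the expensive per-function scan for allocas and side-effecting blocks now lives in CodeExtractorAnalysisCache and is computed once, while extractCodeRegion() and the lifetime-marker legality checks only consult the cache. A minimal usage sketch, assuming the usual block-list CodeExtractor constructor (only the cache-related calls are taken from this patch; the helper names are illustrative):

    // Scan OldFunc once, then reuse the cache for every region outlined from it.
    CodeExtractorAnalysisCache CEAC(*OldFunc);
    for (ArrayRef<BasicBlock *> Region : RegionsToOutline) {
      CodeExtractor CE(Region, /*DT=*/&DT);
      if (Function *Outlined = CE.extractCodeRegion(CEAC))
        ++NumOutlined;
    }
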
@@ -457,7 +483,8 @@ CodeExtractor::getLifetimeMarkers(Instruction *Addr, return Info; } -void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, +void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC, + ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const { Function *Func = (*Blocks.begin())->getParent(); ExitBlock = getCommonExitBlock(Blocks); @@ -478,74 +505,104 @@ void CodeExtractor::findAllocas(ValueSet &SinkCands, ValueSet &HoistCands, return true; }; - for (BasicBlock &BB : *Func) { - if (Blocks.count(&BB)) + // Look up allocas in the original function in CodeExtractorAnalysisCache, as + // this is much faster than walking all the instructions. + for (AllocaInst *AI : CEAC.getAllocas()) { + BasicBlock *BB = AI->getParent(); + if (Blocks.count(BB)) continue; - for (Instruction &II : BB) { - auto *AI = dyn_cast<AllocaInst>(&II); - if (!AI) - continue; - LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(AI, ExitBlock); - bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); - if (Moved) { - LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); - SinkCands.insert(AI); - continue; - } + // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca, + // check whether it is actually still in the original function. + Function *AIFunc = BB->getParent(); + if (AIFunc != Func) + continue; - // Follow any bitcasts. - SmallVector<Instruction *, 2> Bitcasts; - SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo; - for (User *U : AI->users()) { - if (U->stripInBoundsConstantOffsets() == AI) { - Instruction *Bitcast = cast<Instruction>(U); - LifetimeMarkerInfo LMI = getLifetimeMarkers(Bitcast, ExitBlock); - if (LMI.LifeStart) { - Bitcasts.push_back(Bitcast); - BitcastLifetimeInfo.push_back(LMI); - continue; - } - } + LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock); + bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo); + if (Moved) { + LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n"); + SinkCands.insert(AI); + continue; + } - // Found unknown use of AI. - if (!definedInRegion(Blocks, U)) { - Bitcasts.clear(); - break; + // Follow any bitcasts. + SmallVector<Instruction *, 2> Bitcasts; + SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo; + for (User *U : AI->users()) { + if (U->stripInBoundsConstantOffsets() == AI) { + Instruction *Bitcast = cast<Instruction>(U); + LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock); + if (LMI.LifeStart) { + Bitcasts.push_back(Bitcast); + BitcastLifetimeInfo.push_back(LMI); + continue; } } - // Either no bitcasts reference the alloca or there are unknown uses. - if (Bitcasts.empty()) - continue; + // Found unknown use of AI. + if (!definedInRegion(Blocks, U)) { + Bitcasts.clear(); + break; + } + } - LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); - SinkCands.insert(AI); - for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { - Instruction *BitcastAddr = Bitcasts[I]; - const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; - assert(LMI.LifeStart && - "Unsafe to sink bitcast without lifetime markers"); - moveOrIgnoreLifetimeMarkers(LMI); - if (!definedInRegion(Blocks, BitcastAddr)) { - LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr - << "\n"); - SinkCands.insert(BitcastAddr); - } + // Either no bitcasts reference the alloca or there are unknown uses. 
+ if (Bitcasts.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n"); + SinkCands.insert(AI); + for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) { + Instruction *BitcastAddr = Bitcasts[I]; + const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I]; + assert(LMI.LifeStart && + "Unsafe to sink bitcast without lifetime markers"); + moveOrIgnoreLifetimeMarkers(LMI); + if (!definedInRegion(Blocks, BitcastAddr)) { + LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr + << "\n"); + SinkCands.insert(BitcastAddr); } } } } +bool CodeExtractor::isEligible() const { + if (Blocks.empty()) + return false; + BasicBlock *Header = *Blocks.begin(); + Function *F = Header->getParent(); + + // For functions with varargs, check that varargs handling is only done in the + // outlined function, i.e vastart and vaend are only used in outlined blocks. + if (AllowVarArgs && F->getFunctionType()->isVarArg()) { + auto containsVarArgIntrinsic = [](const Instruction &I) { + if (const CallInst *CI = dyn_cast<CallInst>(&I)) + if (const Function *Callee = CI->getCalledFunction()) + return Callee->getIntrinsicID() == Intrinsic::vastart || + Callee->getIntrinsicID() == Intrinsic::vaend; + return false; + }; + + for (auto &BB : *F) { + if (Blocks.count(&BB)) + continue; + if (llvm::any_of(BB, containsVarArgIntrinsic)) + return false; + } + } + return true; +} + void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &SinkCands) const { for (BasicBlock *BB : Blocks) { // If a used value is defined outside the region, it's an input. If an // instruction is used outside the region, it's an output. for (Instruction &II : *BB) { - for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; - ++OI) { - Value *V = *OI; + for (auto &OI : II.operands()) { + Value *V = OI; if (!SinkCands.count(V) && definedInCaller(Blocks, V)) Inputs.insert(V); } @@ -748,7 +805,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, dbgs() << ")\n"; }); - StructType *StructTy; + StructType *StructTy = nullptr; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { StructTy = StructType::get(M->getContext(), paramTy); paramTy.clear(); @@ -904,12 +961,12 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, // within the new function. This must be done before we lose track of which // blocks were originally in the code region. std::vector<User *> Users(header->user_begin(), header->user_end()); - for (unsigned i = 0, e = Users.size(); i != e; ++i) + for (auto &U : Users) // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block - if (Instruction *I = dyn_cast<Instruction>(Users[i])) - if (I->isTerminator() && !Blocks.count(I->getParent()) && - I->getParent()->getParent() == oldFunction) + if (Instruction *I = dyn_cast<Instruction>(U)) + if (I->isTerminator() && I->getFunction() == oldFunction && + !Blocks.count(I->getParent())) I->replaceUsesOfWith(header, newHeader); return newFunction; @@ -1277,13 +1334,6 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { // Insert this basic block into the new function newBlocks.push_back(Block); - - // Remove @llvm.assume calls that were moved to the new function from the - // old function's assumption cache. 
- if (AC) - for (auto &I : *Block) - if (match(&I, m_Intrinsic<Intrinsic::assume>())) - AC->unregisterAssumption(cast<CallInst>(&I)); } } @@ -1332,7 +1382,8 @@ void CodeExtractor::calculateNewCallTerminatorWeights( MDBuilder(TI->getContext()).createBranchWeights(BranchWeights)); } -Function *CodeExtractor::extractCodeRegion() { +Function * +CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) { if (!isEligible()) return nullptr; @@ -1341,27 +1392,6 @@ Function *CodeExtractor::extractCodeRegion() { BasicBlock *header = *Blocks.begin(); Function *oldFunction = header->getParent(); - // For functions with varargs, check that varargs handling is only done in the - // outlined function, i.e vastart and vaend are only used in outlined blocks. - if (AllowVarArgs && oldFunction->getFunctionType()->isVarArg()) { - auto containsVarArgIntrinsic = [](Instruction &I) { - if (const CallInst *CI = dyn_cast<CallInst>(&I)) - if (const Function *F = CI->getCalledFunction()) - return F->getIntrinsicID() == Intrinsic::vastart || - F->getIntrinsicID() == Intrinsic::vaend; - return false; - }; - - for (auto &BB : *oldFunction) { - if (Blocks.count(&BB)) - continue; - if (llvm::any_of(BB, containsVarArgIntrinsic)) - return nullptr; - } - } - ValueSet inputs, outputs, SinkingCands, HoistingCands; - BasicBlock *CommonExit = nullptr; - // Calculate the entry frequency of the new function before we change the root // block. BlockFrequency EntryFreq; @@ -1375,6 +1405,15 @@ Function *CodeExtractor::extractCodeRegion() { } } + if (AC) { + // Remove @llvm.assume calls that were moved to the new function from the + // old function's assumption cache. + for (BasicBlock *Block : Blocks) + for (auto &I : *Block) + if (match(&I, m_Intrinsic<Intrinsic::assume>())) + AC->unregisterAssumption(cast<CallInst>(&I)); + } + // If we have any return instructions in the region, split those blocks so // that the return is not in the region. splitReturnBlocks(); @@ -1428,7 +1467,9 @@ Function *CodeExtractor::extractCodeRegion() { } newFuncRoot->getInstList().push_back(BranchI); - findAllocas(SinkingCands, HoistingCands, CommonExit); + ValueSet inputs, outputs, SinkingCands, HoistingCands; + BasicBlock *CommonExit = nullptr; + findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit); assert(HoistingCands.empty() || CommonExit); // Find inputs to, outputs from the code region. @@ -1563,5 +1604,17 @@ Function *CodeExtractor::extractCodeRegion() { }); LLVM_DEBUG(if (verifyFunction(*oldFunction)) report_fatal_error("verification of oldFunction failed!")); + LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC)) + report_fatal_error("Stale Asumption cache for old Function!")); return newFunction; } + +bool CodeExtractor::verifyAssumptionCache(const Function& F, + AssumptionCache *AC) { + for (auto AssumeVH : AC->assumptions()) { + CallInst *I = cast<CallInst>(AssumeVH); + if (I->getFunction() != &F) + return true; + } + return false; +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp new file mode 100644 index 000000000000..93395ac761ab --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp @@ -0,0 +1,189 @@ +//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This family of functions perform movements on basic blocks, and instructions +// contained within a function. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/CodeMoverUtils.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Dominators.h" + +using namespace llvm; + +#define DEBUG_TYPE "codemover-utils" + +STATISTIC(HasDependences, + "Cannot move across instructions that has memory dependences"); +STATISTIC(MayThrowException, "Cannot move across instructions that may throw"); +STATISTIC(NotControlFlowEquivalent, + "Instructions are not control flow equivalent"); +STATISTIC(NotMovedPHINode, "Movement of PHINodes are not supported"); +STATISTIC(NotMovedTerminator, "Movement of Terminator are not supported"); + +bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT); +} + +bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1, + const DominatorTree &DT, + const PostDominatorTree &PDT) { + if (&BB0 == &BB1) + return true; + + return ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) || + (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0))); +} + +static bool reportInvalidCandidate(const Instruction &I, + llvm::Statistic &Stat) { + ++Stat; + LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". " + << Stat.getDesc()); + return false; +} + +/// Collect all instructions in between \p StartInst and \p EndInst, and store +/// them in \p InBetweenInsts. +static void +collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst, + SmallPtrSetImpl<Instruction *> &InBetweenInsts) { + assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty"); + + /// Get the next instructions of \p I, and push them to \p WorkList. + auto getNextInsts = [](Instruction &I, + SmallPtrSetImpl<Instruction *> &WorkList) { + if (Instruction *NextInst = I.getNextNode()) + WorkList.insert(NextInst); + else { + assert(I.isTerminator() && "Expecting a terminator instruction"); + for (BasicBlock *Succ : successors(&I)) + WorkList.insert(&Succ->front()); + } + }; + + SmallPtrSet<Instruction *, 10> WorkList; + getNextInsts(StartInst, WorkList); + while (!WorkList.empty()) { + Instruction *CurInst = *WorkList.begin(); + WorkList.erase(CurInst); + + if (CurInst == &EndInst) + continue; + + if (!InBetweenInsts.insert(CurInst).second) + continue; + + getNextInsts(*CurInst, WorkList); + } +} + +bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint, + const DominatorTree &DT, + const PostDominatorTree &PDT, + DependenceInfo &DI) { + // Cannot move itself before itself. + if (&I == &InsertPoint) + return false; + + // Not moved. + if (I.getNextNode() == &InsertPoint) + return true; + + if (isa<PHINode>(I) || isa<PHINode>(InsertPoint)) + return reportInvalidCandidate(I, NotMovedPHINode); + + if (I.isTerminator()) + return reportInvalidCandidate(I, NotMovedTerminator); + + // TODO remove this limitation. 
+ if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT)) + return reportInvalidCandidate(I, NotControlFlowEquivalent); + + // As I and InsertPoint are control flow equivalent, if I dominates + // InsertPoint, then I comes before InsertPoint. + const bool MoveForward = DT.dominates(&I, &InsertPoint); + if (MoveForward) { + // When I is being moved forward, we need to make sure the InsertPoint + // dominates every users. Or else, a user may be using an undefined I. + for (const Use &U : I.uses()) + if (auto *UserInst = dyn_cast<Instruction>(U.getUser())) + if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U)) + return false; + } else { + // When I is being moved backward, we need to make sure all its opernads + // dominates the InsertPoint. Or else, an operand may be undefined for I. + for (const Value *Op : I.operands()) + if (auto *OpInst = dyn_cast<Instruction>(Op)) + if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint)) + return false; + } + + Instruction &StartInst = (MoveForward ? I : InsertPoint); + Instruction &EndInst = (MoveForward ? InsertPoint : I); + SmallPtrSet<Instruction *, 10> InstsToCheck; + collectInstructionsInBetween(StartInst, EndInst, InstsToCheck); + if (!MoveForward) + InstsToCheck.insert(&InsertPoint); + + // Check if there exists instructions which may throw, may synchonize, or may + // never return, from I to InsertPoint. + if (!isSafeToSpeculativelyExecute(&I)) + if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [](Instruction *I) { + if (I->mayThrow()) + return true; + + const CallBase *CB = dyn_cast<CallBase>(I); + if (!CB) + return false; + if (!CB->hasFnAttr(Attribute::WillReturn)) + return true; + if (!CB->hasFnAttr(Attribute::NoSync)) + return true; + + return false; + })) { + return reportInvalidCandidate(I, MayThrowException); + } + + // Check if I has any output/flow/anti dependences with instructions from \p + // StartInst to \p EndInst. + if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(), + [&DI, &I](Instruction *CurInst) { + auto DepResult = DI.depends(&I, CurInst, true); + if (DepResult && + (DepResult->isOutput() || DepResult->isFlow() || + DepResult->isAnti())) + return true; + return false; + })) + return reportInvalidCandidate(I, HasDependences); + + return true; +} + +void llvm::moveInstsBottomUp(BasicBlock &FromBB, BasicBlock &ToBB, + const DominatorTree &DT, + const PostDominatorTree &PDT, DependenceInfo &DI) { + for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) { + Instruction *MovePos = ToBB.getFirstNonPHIOrDbg(); + Instruction &I = *It; + // Increment the iterator before modifying FromBB. + ++It; + + if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI)) + I.moveBefore(MovePos); + } +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp new file mode 100644 index 000000000000..b7b4bfa3734d --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Debugify.cpp @@ -0,0 +1,435 @@ +//===- Debugify.cpp - Attach synthetic debug info to everything -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This pass attaches synthetic debug info to everything. It can be used +/// to create targeted tests for debug info preservation. 
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/Debugify.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +namespace { + +cl::opt<bool> Quiet("debugify-quiet", + cl::desc("Suppress verbose debugify output")); + +raw_ostream &dbg() { return Quiet ? nulls() : errs(); } + +uint64_t getAllocSizeInBits(Module &M, Type *Ty) { + return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0; +} + +bool isFunctionSkipped(Function &F) { + return F.isDeclaration() || !F.hasExactDefinition(); +} + +/// Find the basic block's terminating instruction. +/// +/// Special care is needed to handle musttail and deopt calls, as these behave +/// like (but are in fact not) terminators. +Instruction *findTerminatingInstruction(BasicBlock &BB) { + if (auto *I = BB.getTerminatingMustTailCall()) + return I; + if (auto *I = BB.getTerminatingDeoptimizeCall()) + return I; + return BB.getTerminator(); +} + +bool applyDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef Banner) { + // Skip modules with debug info. + if (M.getNamedMetadata("llvm.dbg.cu")) { + dbg() << Banner << "Skipping module with debug info\n"; + return false; + } + + DIBuilder DIB(M); + LLVMContext &Ctx = M.getContext(); + + // Get a DIType which corresponds to Ty. + DenseMap<uint64_t, DIType *> TypeCache; + auto getCachedDIType = [&](Type *Ty) -> DIType * { + uint64_t Size = getAllocSizeInBits(M, Ty); + DIType *&DTy = TypeCache[Size]; + if (!DTy) { + std::string Name = "ty" + utostr(Size); + DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned); + } + return DTy; + }; + + unsigned NextLine = 1; + unsigned NextVar = 1; + auto File = DIB.createFile(M.getName(), "/"); + auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify", + /*isOptimized=*/true, "", 0); + + // Visit each instruction. + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None)); + DISubprogram::DISPFlags SPFlags = + DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized; + if (F.hasPrivateLinkage() || F.hasInternalLinkage()) + SPFlags |= DISubprogram::SPFlagLocalToUnit; + auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine, + SPType, NextLine, DINode::FlagZero, SPFlags); + F.setSubprogram(SP); + for (BasicBlock &BB : F) { + // Attach debug locations. + for (Instruction &I : BB) + I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP)); + + // Inserting debug values into EH pads can break IR invariants. + if (BB.isEHPad()) + continue; + + // Find the terminating instruction, after which no debug values are + // attached. + Instruction *LastInst = findTerminatingInstruction(BB); + assert(LastInst && "Expected basic block with a terminator"); + + // Maintain an insertion point which can't be invalidated when updates + // are made. + BasicBlock::iterator InsertPt = BB.getFirstInsertionPt(); + assert(InsertPt != BB.end() && "Expected to find an insertion point"); + Instruction *InsertBefore = &*InsertPt; + + // Attach debug values. 
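+ // Illustrative note: for each value-producing instruction %x this emits a
+ // synthetic variable plus, roughly, the following call (names are placeholders):
+ //   call void @llvm.dbg.value(metadata i32 %x, metadata !var,
+ //                             metadata !DIExpression()), !dbg !loc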
+ for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) { + // Skip void-valued instructions. + if (I->getType()->isVoidTy()) + continue; + + // Phis and EH pads must be grouped at the beginning of the block. + // Only advance the insertion point when we finish visiting these. + if (!isa<PHINode>(I) && !I->isEHPad()) + InsertBefore = I->getNextNode(); + + std::string Name = utostr(NextVar++); + const DILocation *Loc = I->getDebugLoc().get(); + auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(), + getCachedDIType(I->getType()), + /*AlwaysPreserve=*/true); + DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc, + InsertBefore); + } + } + DIB.finalizeSubprogram(SP); + } + DIB.finalize(); + + // Track the number of distinct lines and variables. + NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify"); + auto *IntTy = Type::getInt32Ty(Ctx); + auto addDebugifyOperand = [&](unsigned N) { + NMD->addOperand(MDNode::get( + Ctx, ValueAsMetadata::getConstant(ConstantInt::get(IntTy, N)))); + }; + addDebugifyOperand(NextLine - 1); // Original number of lines. + addDebugifyOperand(NextVar - 1); // Original number of variables. + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + + // Claim that this synthetic debug info is valid. + StringRef DIVersionKey = "Debug Info Version"; + if (!M.getModuleFlag(DIVersionKey)) + M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION); + + return true; +} + +/// Return true if a mis-sized diagnostic is issued for \p DVI. +bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { + // The size of a dbg.value's value operand should match the size of the + // variable it corresponds to. + // + // TODO: This, along with a check for non-null value operands, should be + // promoted to verifier failures. + Value *V = DVI->getValue(); + if (!V) + return false; + + // For now, don't try to interpret anything more complicated than an empty + // DIExpression. Eventually we should try to handle OP_deref and fragments. + if (DVI->getExpression()->getNumElements()) + return false; + + Type *Ty = V->getType(); + uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty); + Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits(); + if (!ValueOperandSize || !DbgVarSize) + return false; + + bool HasBadSize = false; + if (Ty->isIntegerTy()) { + auto Signedness = DVI->getVariable()->getSignedness(); + if (Signedness && *Signedness == DIBasicType::Signedness::Signed) + HasBadSize = ValueOperandSize < *DbgVarSize; + } else { + HasBadSize = ValueOperandSize != *DbgVarSize; + } + + if (HasBadSize) { + dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize + << ", but its variable has size " << *DbgVarSize << ": "; + DVI->print(dbg()); + dbg() << "\n"; + } + return HasBadSize; +} + +bool checkDebugifyMetadata(Module &M, + iterator_range<Module::iterator> Functions, + StringRef NameOfWrappedPass, StringRef Banner, + bool Strip, DebugifyStatsMap *StatsMap) { + // Skip modules without debugify metadata. 
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify"); + if (!NMD) { + dbg() << Banner << "Skipping module without debugify metadata\n"; + return false; + } + + auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { + return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0)) + ->getZExtValue(); + }; + assert(NMD->getNumOperands() == 2 && + "llvm.debugify should have exactly 2 operands!"); + unsigned OriginalNumLines = getDebugifyOperand(0); + unsigned OriginalNumVars = getDebugifyOperand(1); + bool HasErrors = false; + + // Track debug info loss statistics if able. + DebugifyStatistics *Stats = nullptr; + if (StatsMap && !NameOfWrappedPass.empty()) + Stats = &StatsMap->operator[](NameOfWrappedPass); + + BitVector MissingLines{OriginalNumLines, true}; + BitVector MissingVars{OriginalNumVars, true}; + for (Function &F : Functions) { + if (isFunctionSkipped(F)) + continue; + + // Find missing lines. + for (Instruction &I : instructions(F)) { + if (isa<DbgValueInst>(&I)) + continue; + + auto DL = I.getDebugLoc(); + if (DL && DL.getLine() != 0) { + MissingLines.reset(DL.getLine() - 1); + continue; + } + + if (!DL) { + dbg() << "ERROR: Instruction with empty DebugLoc in function "; + dbg() << F.getName() << " --"; + I.print(dbg()); + dbg() << "\n"; + HasErrors = true; + } + } + + // Find missing variables and mis-sized debug values. + for (Instruction &I : instructions(F)) { + auto *DVI = dyn_cast<DbgValueInst>(&I); + if (!DVI) + continue; + + unsigned Var = ~0U; + (void)to_integer(DVI->getVariable()->getName(), Var, 10); + assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable"); + bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI); + if (!HasBadSize) + MissingVars.reset(Var - 1); + HasErrors |= HasBadSize; + } + } + + // Print the results. + for (unsigned Idx : MissingLines.set_bits()) + dbg() << "WARNING: Missing line " << Idx + 1 << "\n"; + + for (unsigned Idx : MissingVars.set_bits()) + dbg() << "WARNING: Missing variable " << Idx + 1 << "\n"; + + // Update DI loss statistics. + if (Stats) { + Stats->NumDbgLocsExpected += OriginalNumLines; + Stats->NumDbgLocsMissing += MissingLines.count(); + Stats->NumDbgValuesExpected += OriginalNumVars; + Stats->NumDbgValuesMissing += MissingVars.count(); + } + + dbg() << Banner; + if (!NameOfWrappedPass.empty()) + dbg() << " [" << NameOfWrappedPass << "]"; + dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n'; + + // Strip the Debugify Metadata if required. + if (Strip) { + StripDebugInfo(M); + M.eraseNamedMetadata(NMD); + return true; + } + + return false; +} + +/// ModulePass for attaching synthetic debug info to everything, used with the +/// legacy module pass manager. +struct DebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + } + + DebugifyModulePass() : ModulePass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// FunctionPass for attaching synthetic debug info to instructions within a +/// single function, used with the legacy module pass manager. 
+struct DebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + "FunctionDebugify: "); + } + + DebugifyFunctionPass() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. +}; + +/// ModulePass for checking debug info inserted by -debugify, used with the +/// legacy module pass manager. +struct CheckDebugifyModulePass : public ModulePass { + bool runOnModule(Module &M) override { + return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, + "CheckModuleDebugify", Strip, StatsMap); + } + + CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. + +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +/// FunctionPass for checking debug info inserted by -debugify-function, used +/// with the legacy module pass manager. +struct CheckDebugifyFunctionPass : public FunctionPass { + bool runOnFunction(Function &F) override { + Module &M = *F.getParent(); + auto FuncIt = F.getIterator(); + return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)), + NameOfWrappedPass, "CheckFunctionDebugify", + Strip, StatsMap); + } + + CheckDebugifyFunctionPass(bool Strip = false, + StringRef NameOfWrappedPass = "", + DebugifyStatsMap *StatsMap = nullptr) + : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass), + StatsMap(StatsMap) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + static char ID; // Pass identification. 
+ +private: + bool Strip; + StringRef NameOfWrappedPass; + DebugifyStatsMap *StatsMap; +}; + +} // end anonymous namespace + +ModulePass *createDebugifyModulePass() { return new DebugifyModulePass(); } + +FunctionPass *createDebugifyFunctionPass() { + return new DebugifyFunctionPass(); +} + +PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) { + applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: "); + return PreservedAnalyses::all(); +} + +ModulePass *createCheckDebugifyModulePass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap); +} + +FunctionPass *createCheckDebugifyFunctionPass(bool Strip, + StringRef NameOfWrappedPass, + DebugifyStatsMap *StatsMap) { + return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap); +} + +PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, + ModuleAnalysisManager &) { + checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false, + nullptr); + return PreservedAnalyses::all(); +} + +char DebugifyModulePass::ID = 0; +static RegisterPass<DebugifyModulePass> DM("debugify", + "Attach debug info to everything"); + +char CheckDebugifyModulePass::ID = 0; +static RegisterPass<CheckDebugifyModulePass> + CDM("check-debugify", "Check debug info from -debugify"); + +char DebugifyFunctionPass::ID = 0; +static RegisterPass<DebugifyFunctionPass> DF("debugify-function", + "Attach debug info to a function"); + +char CheckDebugifyFunctionPass::ID = 0; +static RegisterPass<CheckDebugifyFunctionPass> + CDF("check-debugify-function", "Check debug info from -debugify-function"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index 4aa40eeadda4..651f776a4915 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; @@ -24,7 +25,7 @@ static void insertCall(Function &CurFn, StringRef Func, if (Func == "mcount" || Func == ".mcount" || - Func == "\01__gnu_mcount_nc" || + Func == "llvm.arm.gnu.eabi.mcount" || Func == "\01_mcount" || Func == "\01mcount" || Func == "__mcount" || diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp index 0e203f4e075d..ad36790b8c6a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -469,7 +469,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, return false; // Cannot handle array allocs. 
} Type *Ty = AI->getAllocatedType(); - AllocaTmps.push_back(llvm::make_unique<GlobalVariable>( + AllocaTmps.push_back(std::make_unique<GlobalVariable>( Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty), AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal, AI->getType()->getPointerAddressSpace())); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 0c52e6f3703b..893f23eb6048 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -67,7 +67,7 @@ public: /// Before: /// ...... /// %cmp10 = fcmp une float %tmp1, %tmp2 -/// br i1 %cmp1, label %if.then, label %lor.rhs +/// br i1 %cmp10, label %if.then, label %lor.rhs /// /// lor.rhs: /// ...... @@ -251,8 +251,8 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { bool EverChanged = false; for (; CurrBlock != FirstCondBlock; CurrBlock = CurrBlock->getSinglePredecessor()) { - BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator()); - CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition()); + auto *BI = cast<BranchInst>(CurrBlock->getTerminator()); + auto *CI = dyn_cast<CmpInst>(BI->getCondition()); if (!CI) continue; @@ -278,7 +278,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { // Do the transformation. BasicBlock *CB; - BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator()); + BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator()); bool Iteration = true; IRBuilder<>::InsertPointGuard Guard(Builder); Value *PC = PBI->getCondition(); @@ -444,7 +444,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { FirstEntryBlock->getInstList().pop_back(); FirstEntryBlock->getInstList() .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList()); - BranchInst *PBI = dyn_cast<BranchInst>(FirstEntryBlock->getTerminator()); + BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator()); Value *CC = PBI->getCondition(); BasicBlock *SaveInsertBB = Builder.GetInsertBlock(); BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint(); @@ -453,6 +453,16 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) { PBI->replaceUsesOfWith(CC, NC); Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt); + // Handle PHI node to replace its predecessors to FirstEntryBlock. + for (BasicBlock *Succ : successors(PBI)) { + for (PHINode &Phi : Succ->phis()) { + for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) { + if (Phi.getIncomingBlock(i) == SecondEntryBlock) + Phi.setIncomingBlock(i, FirstEntryBlock); + } + } + } + // Remove IfTrue1 if (IfTrue1 != FirstEntryBlock) { IfTrue1->dropAllReferences(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index c9cc0990f237..26d48ee0d23f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -12,13 +12,16 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/FunctionImportUtils.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" using namespace llvm; /// Checks if we should import SGV as a definition, otherwise import as a /// declaration. 
bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) { + const GlobalValue *SGV) { + if (!isPerformingImport()) + return false; // Only import the globals requested for importing. if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV))) @@ -31,16 +34,8 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition( return true; } -bool FunctionImportGlobalProcessing::doImportAsDefinition( - const GlobalValue *SGV) { - if (!isPerformingImport()) - return false; - return FunctionImportGlobalProcessing::doImportAsDefinition(SGV, - GlobalsToImport); -} - bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( - const GlobalValue *SGV) { + const GlobalValue *SGV, ValueInfo VI) { assert(SGV->hasLocalLinkage()); // Both the imported references and the original local variable must // be promoted. @@ -65,7 +60,7 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal( // (so the source file name and resulting GUID is the same). Find the one // in this module. auto Summary = ImportIndex.findSummaryInModule( - SGV->getGUID(), SGV->getParent()->getModuleIdentifier()); + VI, SGV->getParent()->getModuleIdentifier()); assert(Summary && "Missing summary for global value when exporting"); auto Linkage = Summary->linkage(); if (!GlobalValue::isLocalLinkage(Linkage)) { @@ -91,18 +86,15 @@ bool FunctionImportGlobalProcessing::isNonRenamableLocal( } #endif -std::string FunctionImportGlobalProcessing::getName(const GlobalValue *SGV, - bool DoPromote) { +std::string +FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) { + assert(SGV->hasLocalLinkage()); // For locals that must be promoted to global scope, ensure that // the promoted name uniquely identifies the copy in the original module, - // using the ID assigned during combined index creation. When importing, - // we rename all locals (not just those that are promoted) in order to - // avoid naming conflicts between locals imported from different modules. - if (SGV->hasLocalLinkage() && (DoPromote || isPerformingImport())) - return ModuleSummaryIndex::getGlobalNameForLocal( - SGV->getName(), - ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); - return SGV->getName(); + // using the ID assigned during combined index creation. + return ModuleSummaryIndex::getGlobalNameForLocal( + SGV->getName(), + ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier())); } GlobalValue::LinkageTypes @@ -210,7 +202,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { if (Function *F = dyn_cast<Function>(&GV)) { if (!F->isDeclaration()) { for (auto &S : VI.getSummaryList()) { - FunctionSummary *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); + auto *FS = cast<FunctionSummary>(S->getBaseObject()); if (FS->modulePath() == M.getModuleIdentifier()) { F->setEntryCount(Function::ProfileCount(FS->entryCount(), Function::PCT_Synthetic)); @@ -229,6 +221,11 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { } } + // We should always have a ValueInfo (i.e. GV in index) for definitions when + // we are exporting, and also when importing that value. + assert(VI || GV.isDeclaration() || + (isPerformingImport() && !doImportAsDefinition(&GV))); + // Mark read/write-only variables which can be imported with specific // attribute. 
We can't internalize them now because IRMover will fail // to link variable definitions to their external declarations during @@ -238,27 +235,42 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { // If global value dead stripping is not enabled in summary then // propagateConstants hasn't been run. We can't internalize GV // in such case. - if (!GV.isDeclaration() && VI && ImportIndex.withGlobalValueDeadStripping()) { - const auto &SL = VI.getSummaryList(); - auto *GVS = SL.empty() ? nullptr : dyn_cast<GlobalVarSummary>(SL[0].get()); - // At this stage "maybe" is "definitely" - if (GVS && (GVS->maybeReadOnly() || GVS->maybeWriteOnly())) - cast<GlobalVariable>(&GV)->addAttribute("thinlto-internalize"); + if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) { + if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) { + // We can have more than one local with the same GUID, in the case of + // same-named locals in different but same-named source files that were + // compiled in their respective directories (so the source file name + // and resulting GUID is the same). Find the one in this module. + // Handle the case where there is no summary found in this module. That + // can happen in the distributed ThinLTO backend, because the index only + // contains summaries from the source modules if they are being imported. + // We might have a non-null VI and get here even in that case if the name + // matches one in this module (e.g. weak or appending linkage). + auto *GVS = dyn_cast_or_null<GlobalVarSummary>( + ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier())); + if (GVS && + (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) { + V->addAttribute("thinlto-internalize"); + // Objects referenced by writeonly GV initializer should not be + // promoted, because there is no any kind of read access to them + // on behalf of this writeonly GV. To avoid promotion we convert + // GV initializer to 'zeroinitializer'. This effectively drops + // references in IR module (not in combined index), so we can + // ignore them when computing import. We do not export references + // of writeonly object. See computeImportForReferencedGlobals + if (ImportIndex.isWriteOnly(GVS)) + V->setInitializer(Constant::getNullValue(V->getValueType())); + } + } } - bool DoPromote = false; - if (GV.hasLocalLinkage() && - ((DoPromote = shouldPromoteLocalToGlobal(&GV)) || isPerformingImport())) { + if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) { // Save the original name string before we rename GV below. auto Name = GV.getName().str(); - // Once we change the name or linkage it is difficult to determine - // again whether we should promote since shouldPromoteLocalToGlobal needs - // to locate the summary (based on GUID from name and linkage). Therefore, - // use DoPromote result saved above. - GV.setName(getName(&GV, DoPromote)); - GV.setLinkage(getLinkage(&GV, DoPromote)); - if (!GV.hasLocalLinkage()) - GV.setVisibility(GlobalValue::HiddenVisibility); + GV.setName(getPromotedName(&GV)); + GV.setLinkage(getLinkage(&GV, /* DoPromote */ true)); + assert(!GV.hasLocalLinkage()); + GV.setVisibility(GlobalValue::HiddenVisibility); // If we are renaming a COMDAT leader, ensure that we record the COMDAT // for later renaming as well. This is required for COFF. 
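At the IR level, promotion plus the new write-only handling behaves roughly as follows (the symbol name and hash suffix below are illustrative, not taken from the patch):

    ; before: a local in the source module
    @counter = internal global i32 0
    ; after promotion for ThinLTO: renamed with the module-hash suffix produced by
    ; getGlobalNameForLocal, given non-local linkage, and marked hidden
    @counter.llvm.123456789 = hidden global i32 0
    ; if the combined index proves the variable write-only, this patch additionally
    ; tags it "thinlto-internalize" and resets its initializer to zeroinitializer
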
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp index 34c32d9c0c98..4cfc9358499a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -10,13 +10,17 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/GuardUtils.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<uint32_t> PredicatePassBranchWeight( "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20), @@ -24,7 +28,7 @@ static cl::opt<uint32_t> PredicatePassBranchWeight( "reciprocal of this value (default = 1 << 20)")); void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, - CallInst *Guard) { + CallInst *Guard, bool UseWC) { OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt)); SmallVector<Value *, 4> Args(std::next(Guard->arg_begin()), Guard->arg_end()); @@ -60,4 +64,63 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, DeoptCall->setCallingConv(Guard->getCallingConv()); DeoptBlockTerm->eraseFromParent(); + + if (UseWC) { + // We want the guard to be expressed as explicit control flow, but still be + // widenable. For that, we add Widenable Condition intrinsic call to the + // guard's condition. + IRBuilder<> B(CheckBI); + auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, + {}, {}, nullptr, "widenable_cond"); + CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, + "exiplicit_guard_cond")); + assert(isWidenableBranch(CheckBI) && "sanity check"); + } +} + + +void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + // The tempting trivially option is to produce something like this: + // br (and oldcond, newcond) where oldcond is assumed to contain a widenable + // condition, but that doesn't match the pattern parseWidenableBranch expects + // so we have to be more sophisticated. + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ... form + IRBuilder<> B(WidenableBR); + C->set(B.CreateAnd(NewCond, C->get())); + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // Condition is only guaranteed to dominate branch + WCAnd->moveBefore(WidenableBR); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy"); +} + +void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) { + assert(isWidenableBranch(WidenableBR) && "precondition"); + + Use *C, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB); + if (!C) { + // br (wc()), ... form + IRBuilder<> B(WidenableBR); + WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get())); + } else { + // br (wc & C), ... 
form + Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition()); + // Condition is only guaranteed to dominate branch + WCAnd->moveBefore(WidenableBR); + C->set(NewCond); + } + assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy"); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp index 8041e66e6c4c..ea93f99d69e3 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp @@ -25,8 +25,8 @@ ImportedFunctionsInliningStatistics::createInlineGraphNode(const Function &F) { auto &ValueLookup = NodesMap[F.getName()]; if (!ValueLookup) { - ValueLookup = llvm::make_unique<InlineGraphNode>(); - ValueLookup->Imported = F.getMetadata("thinlto_src_module") != nullptr; + ValueLookup = std::make_unique<InlineGraphNode>(); + ValueLookup->Imported = F.hasMetadata("thinlto_src_module"); } return *ValueLookup; } @@ -64,7 +64,7 @@ void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) { if (F.isDeclaration()) continue; AllFunctions++; - ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr); + ImportedFunctions += int(F.hasMetadata("thinlto_src_module")); } } static std::string getStatString(const char *Msg, int32_t Fraction, int32_t All, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp new file mode 100644 index 000000000000..9192e74b9ace --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp @@ -0,0 +1,186 @@ +//===- InjectTLIMAppings.cpp - TLI to VFABI attribute injection ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Populates the VFABI attribute with the scalar-to-vector mappings +// from the TargetLibraryInfo. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/InjectTLIMappings.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "inject-tli-mappings" + +STATISTIC(NumCallInjected, + "Number of calls in which the mappings have been injected."); + +STATISTIC(NumVFDeclAdded, + "Number of function declarations that have been added."); +STATISTIC(NumCompUsedAdded, + "Number of `@llvm.compiler.used` operands that have been added."); + +/// Helper function to map the TLI name to a strings that holds +/// scalar-to-vector mapping. +/// +/// _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>) +/// +/// where: +/// +/// <isa> = "_LLVM_" +/// <mask> = "N". Note: TLI does not support masked interfaces. +/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor` +/// field of the `VecDesc` struct. +/// <vparams> = "v", as many as are the number of parameters of CI. +/// <scalarname> = the name of the scalar function called by CI. +/// <vectorname> = the name of the vector function mapped by the TLI. 
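+///
+/// For example, assuming a scalar call to `double sin(double)` that the TLI
+/// maps to the 2-lane routine `_ZGVbN2v_sin`, the mangled name produced for
+/// VF = 2 would be `_ZGV_LLVM_N2v_sin(_ZGVbN2v_sin)`.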
+static std::string mangleTLIName(StringRef VectorName, const CallInst &CI,
+                                 unsigned VF) {
+  SmallString<256> Buffer;
+  llvm::raw_svector_ostream Out(Buffer);
+  Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF;
+  for (unsigned I = 0; I < CI.getNumArgOperands(); ++I)
+    Out << "v";
+  Out << "_" << CI.getCalledFunction()->getName() << "(" << VectorName << ")";
+  return Out.str();
+}
+
+/// A helper function for converting Scalar types to vector types.
+/// If the incoming type is void, we return void. If the VF is 1, we return
+/// the scalar type.
+static Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, {VF, isScalable});
+}
+
+/// A helper function that adds the vector function declaration that
+/// vectorizes the CallInst CI with a vectorization factor of VF
+/// lanes. The TLI assumes that all parameters and the return type of
+/// CI (other than void) need to be widened to a VectorType of VF
+/// lanes.
+static void addVariantDeclaration(CallInst &CI, const unsigned VF,
+                                  const StringRef VFName) {
+  Module *M = CI.getModule();
+
+  // Add function declaration.
+  Type *RetTy = ToVectorTy(CI.getType(), VF);
+  SmallVector<Type *, 4> Tys;
+  for (Value *ArgOperand : CI.arg_operands())
+    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  assert(!CI.getFunctionType()->isVarArg() &&
+         "VarArg functions are not supported.");
+  FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
+  Function *VectorF =
+      Function::Create(FTy, Function::ExternalLinkage, VFName, M);
+  VectorF->copyAttributesFrom(CI.getCalledFunction());
+  ++NumVFDeclAdded;
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
+                    << "` of type " << *(VectorF->getType()) << "\n");
+
+  // Make function declaration (without a body) "sticky" in the IR by
+  // listing it in the @llvm.compiler.used intrinsic.
+  assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
+                             "only on declarations.");
+  appendToCompilerUsed(*M, {VectorF});
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
+                    << "` to `@llvm.compiler.used`.\n");
+  ++NumCompUsedAdded;
+}
+
+static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
+  // This is needed to make sure we don't query the TLI for calls to
+  // bitcasts of function pointers, like `%call = call i32 (i32*, ...)
+  // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
+  // as such calls make `isFunctionVectorizable` raise an
+  // exception.
+  if (CI.isNoBuiltin() || !CI.getCalledFunction())
+    return;
+
+  const std::string ScalarName = CI.getCalledFunction()->getName();
+  // Nothing to be done if the TLI thinks the function is not
+  // vectorizable.
+  if (!TLI.isFunctionVectorizable(ScalarName))
+    return;
+  SmallVector<std::string, 8> Mappings;
+  VFABI::getVectorVariantNames(CI, Mappings);
+  Module *M = CI.getModule();
+  const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
+                                                   Mappings.end());
+  // All VFs in the TLI are powers of 2.
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF; + VF *= 2) { + const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF); + if (!TLIName.empty()) { + std::string MangledName = mangleTLIName(TLIName, CI, VF); + if (!OriginalSetOfMappings.count(MangledName)) { + Mappings.push_back(MangledName); + ++NumCallInjected; + } + Function *VariantF = M->getFunction(TLIName); + if (!VariantF) + addVariantDeclaration(CI, VF, TLIName); + } + } + + VFABI::setVectorVariantNames(&CI, Mappings); +} + +static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { + for (auto &I : instructions(F)) + if (auto CI = dyn_cast<CallInst>(&I)) + addMappingsFromTLI(TLI, *CI); + // Even if the pass adds IR attributes, the analyses are preserved. + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// New pass manager implementation. +//////////////////////////////////////////////////////////////////////////////// +PreservedAnalyses InjectTLIMappings::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F); + runImpl(TLI, F); + // Even if the pass adds IR attributes, the analyses are preserved. + return PreservedAnalyses::all(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy PM Implementation. +//////////////////////////////////////////////////////////////////////////////// +bool InjectTLIMappingsLegacy::runOnFunction(Function &F) { + const TargetLibraryInfo &TLI = + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + return runImpl(TLI, F); +} + +void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<TargetLibraryInfoWrapperPass>(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Legacy Pass manager initialization +//////////////////////////////////////////////////////////////////////////////// +char InjectTLIMappingsLegacy::ID = 0; + +INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE, + "Inject TLI Mappings", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings", + false, false) + +FunctionPass *llvm::createInjectTLIMappingsLegacyPass() { + return new InjectTLIMappingsLegacy(); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp index a7f0f7ac5d61..6da612eb4e65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1254,7 +1254,8 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M, // Always generate a memcpy of alignment 1 here because we don't know // the alignment of the src pointer. Other optimizations can infer // better alignment. - Builder.CreateMemCpy(Dst, /*DstAlign*/1, Src, /*SrcAlign*/1, Size); + Builder.CreateMemCpy(Dst, /*DstAlign*/ Align::None(), Src, + /*SrcAlign*/ Align::None(), Size); } /// When inlining a call site that has a byval argument, @@ -1293,16 +1294,16 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall, } // Create the alloca. If we have DataLayout, use nice alignment. 
-  unsigned Align = DL.getPrefTypeAlignment(AggTy);
+  Align Alignment(DL.getPrefTypeAlignment(AggTy));
 
   // If the byval had an alignment specified, we *must* use at least that
   // alignment, as it is required by the byval argument (and uses of the
   // pointer inside the callee).
-  Align = std::max(Align, ByValAlignment);
+  Alignment = max(Alignment, MaybeAlign(ByValAlignment));
 
-  Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
-                                    nullptr, Align, Arg->getName(),
-                                    &*Caller->begin()->begin());
+  Value *NewAlloca =
+      new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+                     Arg->getName(), &*Caller->begin()->begin());
   IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
 
   // Uses of the argument in the function should use our new alloca
@@ -1405,6 +1406,10 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
   // other.
   DenseMap<const MDNode *, MDNode *> IANodes;
 
+  // Check if we are not generating inline line tables and want to use
+  // the call site location instead.
+  bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
+
   for (; FI != Fn->end(); ++FI) {
     for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
          BI != BE; ++BI) {
@@ -1416,20 +1421,22 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
         BI->setMetadata(LLVMContext::MD_loop, NewLoopID);
       }
 
-      if (DebugLoc DL = BI->getDebugLoc()) {
-        DebugLoc IDL =
-            inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
-        BI->setDebugLoc(IDL);
-        continue;
-      }
+      if (!NoInlineLineTables)
+        if (DebugLoc DL = BI->getDebugLoc()) {
+          DebugLoc IDL =
+              inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
+          BI->setDebugLoc(IDL);
+          continue;
+        }
 
-      if (CalleeHasDebugInfo)
+      if (CalleeHasDebugInfo && !NoInlineLineTables)
         continue;
 
-      // If the inlined instruction has no line number, make it look as if it
-      // originates from the call location. This is important for
-      // ((__always_inline__, __nodebug__)) functions which must use caller
-      // location for all instructions in their function body.
+      // If the inlined instruction has no line number, or if inline info
+      // is not being generated, make it look as if it originates from the call
+      // location. This is important for ((__always_inline__, __nodebug__))
+      // functions which must use caller location for all instructions in their
+      // function body.
 
       // Don't update static allocas, as they may get moved later.
       if (auto *AI = dyn_cast<AllocaInst>(BI))
@@ -1438,6 +1445,19 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
 
       BI->setDebugLoc(TheCallDL);
     }
+
+    // Remove debug info intrinsics if we're not keeping inline info.
+    if (NoInlineLineTables) {
+      BasicBlock::iterator BI = FI->begin();
+      while (BI != FI->end()) {
+        if (isa<DbgInfoIntrinsic>(BI)) {
+          BI = BI->eraseFromParent();
+          continue;
+        }
+        ++BI;
+      }
+    }
   }
 }
 
@@ -1453,7 +1473,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
                             BlockFrequencyInfo *CalleeBFI,
                             const BasicBlock &CalleeEntryBlock) {
   SmallPtrSet<BasicBlock *, 16> ClonedBBs;
-  for (auto const &Entry : VMap) {
+  for (auto Entry : VMap) {
     if (!isa<BasicBlock>(Entry.first) || !Entry.second)
       continue;
     auto *OrigBB = cast<BasicBlock>(Entry.first);
@@ -1508,22 +1528,25 @@ void llvm::updateProfileCallee(
   else
     newEntryCount = priorEntryCount + entryDelta;
 
-  Callee->setEntryCount(newEntryCount);
-
   // During inlining ?
if (VMap) { uint64_t cloneEntryCount = priorEntryCount - newEntryCount; - for (auto const &Entry : *VMap) + for (auto Entry : *VMap) if (isa<CallInst>(Entry.first)) if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second)) CI->updateProfWeight(cloneEntryCount, priorEntryCount); } - for (BasicBlock &BB : *Callee) - // No need to update the callsite if it is pruned during inlining. - if (!VMap || VMap->count(&BB)) - for (Instruction &I : BB) - if (CallInst *CI = dyn_cast<CallInst>(&I)) - CI->updateProfWeight(newEntryCount, priorEntryCount); + + if (entryDelta) { + Callee->setEntryCount(newEntryCount); + + for (BasicBlock &BB : *Callee) + // No need to update the callsite if it is pruned during inlining. + if (!VMap || VMap->count(&BB)) + for (Instruction &I : BB) + if (CallInst *CI = dyn_cast<CallInst>(&I)) + CI->updateProfWeight(newEntryCount, priorEntryCount); + } } /// This function inlines the called function into the basic block of the @@ -1842,6 +1865,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Scan for the block of allocas that we can move over, and move them // all at once. while (isa<AllocaInst>(I) && + !cast<AllocaInst>(I)->use_empty() && allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) { IFI.StaticAllocas.push_back(cast<AllocaInst>(I)); ++I; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp index 6c4fc1ceb991..aac0b55801c4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/InstructionNamer.cpp @@ -15,6 +15,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp index 29e7c5260f46..5746d69260d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LCSSA.cpp @@ -43,7 +43,9 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PredIteratorCache.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -74,7 +76,8 @@ static bool isExitBlock(BasicBlock *BB, /// that are outside the current loop. If so, insert LCSSA PHI nodes and /// rewrite the uses. bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, - DominatorTree &DT, LoopInfo &LI) { + DominatorTree &DT, LoopInfo &LI, + ScalarEvolution *SE) { SmallVector<Use *, 16> UsesToRewrite; SmallSetVector<PHINode *, 16> PHIsToRemove; PredIteratorCache PredCache; @@ -134,6 +137,11 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, SSAUpdater SSAUpdate(&InsertedPHIs); SSAUpdate.Initialize(I->getType(), I->getName()); + // Force re-computation of I, as some users now need to use the new PHI + // node. + if (SE) + SE->forgetValue(I); + // Insert the LCSSA phi's into all of the exit blocks dominated by the // value, and add them to the Phi's map. 
for (BasicBlock *ExitBB : ExitBlocks) { @@ -192,9 +200,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, UserBB = PN->getIncomingBlock(*UseToRewrite); if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, &UserBB->front()); UseToRewrite->set(&UserBB->front()); continue; } @@ -202,10 +207,6 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist, // If we added a single PHI, it must dominate all uses and we can directly // rename it. if (AddedPHIs.size() == 1) { - // Tell the VHs that the uses changed. This updates SCEV's caches. - // We might call ValueIsRAUWd multiple times for the same value. - if (UseToRewrite->get()->hasValueHandle()) - ValueHandleBase::ValueIsRAUWd(*UseToRewrite, AddedPHIs[0]); UseToRewrite->set(AddedPHIs[0]); continue; } @@ -368,7 +369,7 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI, Worklist.push_back(&I); } } - Changed = formLCSSAForInstructions(Worklist, DT, *LI); + Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE); // If we modified the code, remove any caches about the loop from SCEV to // avoid dangling entries. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp index 8c67d1dc6eb3..4c52fac6f7cb 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LibCallsShrinkWrap.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -533,7 +534,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI, } bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) { - auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); + auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; return runImpl(F, TLI, DT); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp index 39b6b889f91c..b2d511c7c9a9 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Local.cpp @@ -324,8 +324,14 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Value *Address = IBI->getAddress(); IBI->eraseFromParent(); if (DeleteDeadConditions) + // Delete pointer cast instructions. RecursivelyDeleteTriviallyDeadInstructions(Address, TLI); + // Also zap the blockaddress constant if there are no users remaining, + // otherwise the destination is still marked as having its address taken. + if (BA->use_empty()) + BA->destroyConstant(); + // If we didn't find our destination in the IBI successor list, then we // have undefined behavior. Replace the unconditional branch with an // 'unreachable' instruction. @@ -633,17 +639,6 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, // Control Flow Graph Restructuring. 
// -/// RemovePredecessorAndSimplify - Like BasicBlock::removePredecessor, this -/// method is called when we're about to delete Pred as a predecessor of BB. If -/// BB contains any PHI nodes, this drops the entries in the PHI nodes for Pred. -/// -/// Unlike the removePredecessor method, this attempts to simplify uses of PHI -/// nodes that collapse into identity values. For example, if we have: -/// x = phi(1, 0, 0, 0) -/// y = and x, z -/// -/// .. and delete the predecessor corresponding to the '1', this will attempt to -/// recursively fold the and to 0. void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DomTreeUpdater *DTU) { // This only adjusts blocks with PHI nodes. @@ -672,10 +667,6 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred, DTU->applyUpdatesPermissive({{DominatorTree::Delete, Pred, BB}}); } -/// MergeBasicBlockIntoOnlyPred - DestBB is a block with one predecessor and its -/// predecessor is known to have one successor (DestBB!). Eliminate the edge -/// between them, moving the instructions in the predecessor into DestBB and -/// deleting the predecessor block. void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, DomTreeUpdater *DTU) { @@ -755,15 +746,14 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, } } -/// CanMergeValues - Return true if we can choose one of these values to use -/// in place of the other. Note that we will always choose the non-undef -/// value to keep. +/// Return true if we can choose one of these values to use in place of the +/// other. Note that we will always choose the non-undef value to keep. static bool CanMergeValues(Value *First, Value *Second) { return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second); } -/// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an -/// almost-empty BB ending in an unconditional branch to Succ, into Succ. +/// Return true if we can fold BB, an almost-empty BB ending in an unconditional +/// branch to Succ, into Succ. /// /// Assumption: Succ is the single successor for BB. static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) { @@ -956,11 +946,6 @@ static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB, replaceUndefValuesInPhi(PN, IncomingValues); } -/// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an -/// unconditional branch, and contains no instructions other than PHI nodes, -/// potential side-effect free intrinsics and the branch. If possible, -/// eliminate BB by rewriting all the predecessors to branch to the successor -/// block and return true. If we can't transform, return false. bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, DomTreeUpdater *DTU) { assert(BB != &BB->getParent()->getEntryBlock() && @@ -1088,10 +1073,6 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, return true; } -/// EliminateDuplicatePHINodes - Check for and eliminate duplicate PHI -/// nodes in this block. This doesn't try to be clever about PHI nodes -/// which differ only in the order of the incoming values, but instcombine -/// orders them so it usually won't matter. bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { // This implementation doesn't currently consider undef operands // specially. Theoretically, two phis which are identical except for @@ -1151,10 +1132,10 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { /// often possible though. 
If alignment is important, a more reliable approach /// is to simply align all global variables and allocation instructions to /// their preferred alignment from the beginning. -static unsigned enforceKnownAlignment(Value *V, unsigned Align, +static unsigned enforceKnownAlignment(Value *V, unsigned Alignment, unsigned PrefAlign, const DataLayout &DL) { - assert(PrefAlign > Align); + assert(PrefAlign > Alignment); V = V->stripPointerCasts(); @@ -1165,36 +1146,36 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align, // stripPointerCasts recurses through infinite layers of bitcasts, // while computeKnownBits is not allowed to traverse more than 6 // levels. - Align = std::max(AI->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(AI->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If the preferred alignment is greater than the natural stack alignment // then don't round up. This avoids dynamic stack realignment. - if (DL.exceedsNaturalStackAlignment(PrefAlign)) - return Align; - AI->setAlignment(PrefAlign); + if (DL.exceedsNaturalStackAlignment(Align(PrefAlign))) + return Alignment; + AI->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } if (auto *GO = dyn_cast<GlobalObject>(V)) { // TODO: as above, this shouldn't be necessary. - Align = std::max(GO->getAlignment(), Align); - if (PrefAlign <= Align) - return Align; + Alignment = std::max(GO->getAlignment(), Alignment); + if (PrefAlign <= Alignment) + return Alignment; // If there is a large requested alignment and we can, bump up the alignment // of the global. If the memory we set aside for the global may not be the // memory used by the final program then it is impossible for us to reliably // enforce the preferred alignment. if (!GO->canIncreaseAlignment()) - return Align; + return Alignment; - GO->setAlignment(PrefAlign); + GO->setAlignment(MaybeAlign(PrefAlign)); return PrefAlign; } - return Align; + return Alignment; } unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, @@ -1397,7 +1378,12 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, /// Determine whether this alloca is either a VLA or an array. static bool isArray(AllocaInst *AI) { return AI->isArrayAllocation() || - AI->getType()->getElementType()->isArrayTy(); + (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy()); +} + +/// Determine whether this alloca is a structure. +static bool isStructure(AllocaInst *AI) { + return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy(); } /// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set @@ -1422,7 +1408,7 @@ bool llvm::LowerDbgDeclare(Function &F) { // stored on the stack, while the dbg.declare can only describe // the stack slot (and at a lexical-scope granularity). Later // passes will attempt to elide the stack slot. - if (!AI || isArray(AI)) + if (!AI || isArray(AI) || isStructure(AI)) continue; // A volatile load/store means that the alloca can't be elided anyway. 
@@ -1435,22 +1421,32 @@ bool llvm::LowerDbgDeclare(Function &F) { })) continue; - for (auto &AIUse : AI->uses()) { - User *U = AIUse.getUser(); - if (StoreInst *SI = dyn_cast<StoreInst>(U)) { - if (AIUse.getOperandNo() == 1) - ConvertDebugDeclareToDebugValue(DDI, SI, DIB); - } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { - ConvertDebugDeclareToDebugValue(DDI, LI, DIB); - } else if (CallInst *CI = dyn_cast<CallInst>(U)) { - // This is a call by-value or some other instruction that takes a - // pointer to the variable. Insert a *value* intrinsic that describes - // the variable by dereferencing the alloca. - DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); - auto *DerefExpr = - DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); - DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, NewLoc, - CI); + SmallVector<const Value *, 8> WorkList; + WorkList.push_back(AI); + while (!WorkList.empty()) { + const Value *V = WorkList.pop_back_val(); + for (auto &AIUse : V->uses()) { + User *U = AIUse.getUser(); + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (AIUse.getOperandNo() == 1) + ConvertDebugDeclareToDebugValue(DDI, SI, DIB); + } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + ConvertDebugDeclareToDebugValue(DDI, LI, DIB); + } else if (CallInst *CI = dyn_cast<CallInst>(U)) { + // This is a call by-value or some other instruction that takes a + // pointer to the variable. Insert a *value* intrinsic that describes + // the variable by dereferencing the alloca. + if (!CI->isLifetimeStartOrEnd()) { + DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr); + auto *DerefExpr = + DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref); + DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr, + NewLoc, CI); + } + } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) { + if (BI->getType()->isPointerTy()) + WorkList.push_back(BI); + } } } DDI->eraseFromParent(); @@ -1591,15 +1587,10 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress, DIExpr->getElement(0) != dwarf::DW_OP_deref) return; - // Insert the offset immediately after the first deref. + // Insert the offset before the first deref. // We could just change the offset argument of dbg.value, but it's unsigned... - if (Offset) { - SmallVector<uint64_t, 4> Ops; - Ops.push_back(dwarf::DW_OP_deref); - DIExpression::appendOffset(Ops, Offset); - Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end()); - DIExpr = Builder.createExpression(Ops); - } + if (Offset) + DIExpr = DIExpression::prepend(DIExpr, 0, Offset); Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI); DVI->eraseFromParent(); @@ -1630,6 +1621,11 @@ bool llvm::salvageDebugInfo(Instruction &I) { return salvageDebugInfoForDbgValues(I, DbgUsers); } +void llvm::salvageDebugInfoOrMarkUndef(Instruction &I) { + if (!salvageDebugInfo(I)) + replaceDbgUsesWithUndef(&I); +} + bool llvm::salvageDebugInfoForDbgValues( Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) { auto &Ctx = I.getContext(); @@ -1680,9 +1676,8 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, }; // initializer-list helper for applying operators to the source DIExpression. 
- auto applyOps = - [&](std::initializer_list<uint64_t> Opcodes) -> DIExpression * { - SmallVector<uint64_t, 8> Ops(Opcodes); + auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * { + SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end()); return doSalvage(Ops); }; @@ -1690,8 +1685,21 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, // No-op casts and zexts are irrelevant for debug info. if (CI->isNoopCast(DL) || isa<ZExtInst>(&I)) return SrcDIExpr; - return nullptr; - } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + + Type *Type = CI->getType(); + // Casts other than Trunc or SExt to scalar types cannot be salvaged. + if (Type->isVectorTy() || (!isa<TruncInst>(&I) && !isa<SExtInst>(&I))) + return nullptr; + + Value *FromValue = CI->getOperand(0); + unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits(); + unsigned ToTypeBitSize = Type->getScalarSizeInBits(); + + return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize, + isa<SExtInst>(&I))); + } + + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { unsigned BitWidth = M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace()); // Rewrite a constant GEP into a DIExpression. @@ -1746,7 +1754,7 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, using DbgValReplacement = Optional<DIExpression *>; /// Point debug users of \p From to \p To using exprs given by \p RewriteExpr, -/// possibly moving/deleting users to prevent use-before-def. Returns true if +/// possibly moving/undefing users to prevent use-before-def. Returns true if /// changes are made. static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, @@ -1759,7 +1767,7 @@ static bool rewriteDebugUsers( // Prevent use-before-def of To. bool Changed = false; - SmallPtrSet<DbgVariableIntrinsic *, 1> DeleteOrSalvage; + SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage; if (isa<Instruction>(&To)) { bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint; @@ -1774,14 +1782,14 @@ static bool rewriteDebugUsers( // Users which otherwise aren't dominated by the replacement value must // be salvaged or deleted. } else if (!DT.dominates(&DomPoint, DII)) { - DeleteOrSalvage.insert(DII); + UndefOrSalvage.insert(DII); } } } // Update debug users without use-before-def risk. for (auto *DII : Users) { - if (DeleteOrSalvage.count(DII)) + if (UndefOrSalvage.count(DII)) continue; LLVMContext &Ctx = DII->getContext(); @@ -1795,18 +1803,10 @@ static bool rewriteDebugUsers( Changed = true; } - if (!DeleteOrSalvage.empty()) { + if (!UndefOrSalvage.empty()) { // Try to salvage the remaining debug users. - Changed |= salvageDebugInfo(From); - - // Delete the debug users which weren't salvaged. - for (auto *DII : DeleteOrSalvage) { - if (DII->getVariableLocation() == &From) { - LLVM_DEBUG(dbgs() << "Erased UseBeforeDef: " << *DII << '\n'); - DII->eraseFromParent(); - Changed = true; - } - } + salvageDebugInfoOrMarkUndef(From); + Changed = true; } return Changed; @@ -1881,10 +1881,8 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, return None; bool Signed = *Signedness == DIBasicType::Signedness::Signed; - dwarf::TypeKind TK = Signed ? 
dwarf::DW_ATE_signed : dwarf::DW_ATE_unsigned; - SmallVector<uint64_t, 8> Ops({dwarf::DW_OP_LLVM_convert, ToBits, TK, - dwarf::DW_OP_LLVM_convert, FromBits, TK}); - return DIExpression::appendToStack(DII.getExpression(), Ops); + return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits, + Signed); }; return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt); } @@ -1957,18 +1955,24 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap, return NumInstrsRemoved; } -/// changeToCall - Convert the specified invoke into a normal call. -static void changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr) { - SmallVector<Value*, 8> Args(II->arg_begin(), II->arg_end()); +CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) { + SmallVector<Value *, 8> Args(II->arg_begin(), II->arg_end()); SmallVector<OperandBundleDef, 1> OpBundles; II->getOperandBundlesAsDefs(OpBundles); - CallInst *NewCall = CallInst::Create( - II->getFunctionType(), II->getCalledValue(), Args, OpBundles, "", II); - NewCall->takeName(II); + CallInst *NewCall = CallInst::Create(II->getFunctionType(), + II->getCalledValue(), Args, OpBundles); NewCall->setCallingConv(II->getCallingConv()); NewCall->setAttributes(II->getAttributes()); NewCall->setDebugLoc(II->getDebugLoc()); NewCall->copyMetadata(*II); + return NewCall; +} + +/// changeToCall - Convert the specified invoke into a normal call. +void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) { + CallInst *NewCall = createCallMatchingInvoke(II); + NewCall->takeName(II); + NewCall->insertBefore(II); II->replaceAllUsesWith(NewCall); // Follow the call by a branch to the normal destination. @@ -2223,12 +2227,10 @@ void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) { /// removeUnreachableBlocks - Remove blocks that are not reachable, even /// if they are in a dead cycle. Return true if a change was made, false -/// otherwise. If `LVI` is passed, this function preserves LazyValueInfo -/// after modifying the CFG. -bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, - DomTreeUpdater *DTU, +/// otherwise. +bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, MemorySSAUpdater *MSSAU) { - SmallPtrSet<BasicBlock*, 16> Reachable; + SmallPtrSet<BasicBlock *, 16> Reachable; bool Changed = markAliveBlocks(F, Reachable, DTU); // If there are unreachable blocks in the CFG... @@ -2236,21 +2238,21 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, return Changed; assert(Reachable.size() < F.size()); - NumRemoved += F.size()-Reachable.size(); + NumRemoved += F.size() - Reachable.size(); SmallSetVector<BasicBlock *, 8> DeadBlockSet; - for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ++I) { - auto *BB = &*I; - if (Reachable.count(BB)) + for (BasicBlock &BB : F) { + // Skip reachable basic blocks + if (Reachable.find(&BB) != Reachable.end()) continue; - DeadBlockSet.insert(BB); + DeadBlockSet.insert(&BB); } if (MSSAU) MSSAU->removeBlocks(DeadBlockSet); // Loop over all of the basic blocks that are not reachable, dropping all of - // their internal references. Update DTU and LVI if available. + // their internal references. Update DTU if available. 
std::vector<DominatorTree::UpdateType> Updates; for (auto *BB : DeadBlockSet) { for (BasicBlock *Successor : successors(BB)) { @@ -2259,26 +2261,18 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, if (DTU) Updates.push_back({DominatorTree::Delete, BB, Successor}); } - if (LVI) - LVI->eraseBlock(BB); BB->dropAllReferences(); - } - for (Function::iterator I = ++F.begin(); I != F.end();) { - auto *BB = &*I; - if (Reachable.count(BB)) { - ++I; - continue; - } if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); - ++I; - } else { - I = F.getBasicBlockList().erase(I); } } @@ -2294,7 +2288,11 @@ bool llvm::removeUnreachableBlocks(Function &F, LazyValueInfo *LVI, } if (!Deleted) return false; + } else { + for (auto *BB : DeadBlockSet) + BB->eraseFromParent(); } + return true; } @@ -2363,6 +2361,9 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(Kind, MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD)); break; + case LLVMContext::MD_preserve_access_index: + // Preserve !preserve.access.index in K. + break; } } // Set !invariant.group from J if J has it. If both instructions have it @@ -2385,10 +2386,61 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, LLVMContext::MD_invariant_group, LLVMContext::MD_align, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(K, J, KnownIDs, KDominatesJ); } +void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) { + SmallVector<std::pair<unsigned, MDNode *>, 8> MD; + Source.getAllMetadata(MD); + MDBuilder MDB(Dest.getContext()); + Type *NewType = Dest.getType(); + const DataLayout &DL = Source.getModule()->getDataLayout(); + for (const auto &MDPair : MD) { + unsigned ID = MDPair.first; + MDNode *N = MDPair.second; + // Note, essentially every kind of metadata should be preserved here! This + // routine is supposed to clone a load instruction changing *only its type*. + // The only metadata it makes sense to drop is metadata which is invalidated + // when the pointer type changes. This should essentially never be the case + // in LLVM, but we explicitly switch over only known metadata to be + // conservatively correct. If you are adding metadata to LLVM which pertains + // to loads, you almost certainly want to add it here. + switch (ID) { + case LLVMContext::MD_dbg: + case LLVMContext::MD_tbaa: + case LLVMContext::MD_prof: + case LLVMContext::MD_fpmath: + case LLVMContext::MD_tbaa_struct: + case LLVMContext::MD_invariant_load: + case LLVMContext::MD_alias_scope: + case LLVMContext::MD_noalias: + case LLVMContext::MD_nontemporal: + case LLVMContext::MD_mem_parallel_loop_access: + case LLVMContext::MD_access_group: + // All of these directly apply. 
+ Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_nonnull: + copyNonnullMetadata(Source, N, Dest); + break; + + case LLVMContext::MD_align: + case LLVMContext::MD_dereferenceable: + case LLVMContext::MD_dereferenceable_or_null: + // These only directly apply if the new type is also a pointer. + if (NewType->isPointerTy()) + Dest.setMetadata(ID, N); + break; + + case LLVMContext::MD_range: + copyRangeMetadata(DL, Source, N, Dest); + break; + } + } +} + void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { auto *ReplInst = dyn_cast<Instruction>(Repl); if (!ReplInst) @@ -2417,7 +2469,7 @@ void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) { LLVMContext::MD_noalias, LLVMContext::MD_range, LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index}; combineMetadata(ReplInst, I, KnownIDs, false); } @@ -2539,7 +2591,7 @@ void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI, if (!NewTy->isPointerTy()) return; - unsigned BitWidth = DL.getIndexTypeSizeInBits(NewTy); + unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy); if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) { MDNode *NN = MDNode::get(OldLI.getContext(), None); NewLI.setMetadata(LLVMContext::MD_nonnull, NN); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 37389a695b45..c065e0269c64 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -76,6 +76,13 @@ private: }; } // end anonymous namespace +/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not +/// previously exist in the map, and the value was inserted. +static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) { + bool Inserted = VM.insert({K, V}).second; + assert(Inserted); + (void)Inserted; +} /// RewriteUsesOfClonedInstructions - We just cloned the instructions from the /// old header into the preheader. If there were uses of the values produced by /// these instruction that were outside of the loop, we have to insert PHI nodes @@ -300,7 +307,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // For PHI nodes, the value available in OldPreHeader is just the // incoming value from OldPreHeader. for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + InsertNewValueIntoMap(ValueMap, PN, + PN->getIncomingValueForBlock(OrigPreheader)); // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. @@ -358,13 +366,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. - ValueMap[Inst] = V; + InsertNewValueIntoMap(ValueMap, Inst, V); if (!C->mayHaveSideEffects()) { C->deleteValue(); C = nullptr; } } else { - ValueMap[Inst] = C; + InsertNewValueIntoMap(ValueMap, Inst, C); } if (C) { // Otherwise, stick the new instruction into the new block! 
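A plausible use of the copyMetadataForLoad helper introduced above is a transform that re-issues a load at a different type while keeping whatever metadata remains valid. The following sketch illustrates that; the cloneLoadWithNewType wrapper and its builder argument are invented for illustration and are not part of the patch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;

// Hypothetical caller of copyMetadataForLoad: clone OldLoad at NewTy and
// carry over only the metadata that stays valid at the new type.
static LoadInst *cloneLoadWithNewType(LoadInst &OldLoad, Type *NewTy,
                                      IRBuilder<> &B) {
  Value *Ptr = B.CreateBitCast(OldLoad.getPointerOperand(),
                               NewTy->getPointerTo());
  LoadInst *NewLoad = B.CreateAlignedLoad(NewTy, Ptr, OldLoad.getAlignment());
  // Routes !nonnull and !range through their dedicated copy helpers and
  // drops pointer-only metadata when NewTy is not a pointer.
  copyMetadataForLoad(*NewLoad, OldLoad);
  return NewLoad;
}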
@@ -376,7 +384,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { AC->registerAssumption(II); // MemorySSA cares whether the cloned instruction was inserted or not, and // not whether it can be remapped to a simplified value. - ValueMapMSSA[Inst] = C; + if (MSSAU) + InsertNewValueIntoMap(ValueMapMSSA, Inst, C); } } @@ -396,7 +405,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // Update MemorySSA before the rewrite call below changes the 1:1 // instruction:cloned_instruction_or_value mapping. if (MSSAU) { - ValueMapMSSA[OrigHeader] = OrigPreheader; + InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader); MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader, ValueMapMSSA); } @@ -615,30 +624,9 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) { LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); - // Hoist the instructions from Latch into LastExit. - Instruction *FirstLatchInst = &*(Latch->begin()); - LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), - Latch->begin(), Jmp->getIterator()); - - // Update MemorySSA - if (MSSAU) - MSSAU->moveAllAfterMergeBlocks(Latch, LastExit, FirstLatchInst); - - unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; - BasicBlock *Header = Jmp->getSuccessor(0); - assert(Header == L->getHeader() && "expected a backward branch"); - - // Remove Latch from the CFG so that LastExit becomes the new Latch. - BI->setSuccessor(FallThruPath, Header); - Latch->replaceSuccessorsPhiUsesWith(LastExit); - Jmp->eraseFromParent(); - - // Nuke the Latch block. - assert(Latch->empty() && "unable to evacuate Latch"); - LI->removeBlock(Latch); - if (DT) - DT->eraseNode(Latch); - Latch->eraseFromParent(); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr, + /*PredecessorWithTwoSuccessors=*/true); if (MSSAU && VerifyMemorySSA) MSSAU->getMemorySSA()->verifyMemorySSA(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 7e6da02d5707..28f88f39a712 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -67,6 +67,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" @@ -808,7 +809,7 @@ bool LoopSimplify::runOnFunction(Function &F) { auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>(); if (MSSAAnalysis) { MSSA = &MSSAAnalysis->getMSSA(); - MSSAU = make_unique<MemorySSAUpdater>(MSSA); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); } } @@ -835,12 +836,19 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F); ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F); AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F); + auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F); + std::unique_ptr<MemorySSAUpdater> MSSAU; + if (MSSAAnalysis) { + auto *MSSA = &MSSAAnalysis->getMSSA(); + MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); + } + // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA - // after simplifying the loops. MemorySSA is not preserved either. + // after simplifying the loops. 
MemorySSA is preserved if it exists. for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) Changed |= - simplifyLoop(*I, DT, LI, SE, AC, nullptr, /*PreserveLCSSA*/ false); + simplifyLoop(*I, DT, LI, SE, AC, MSSAU.get(), /*PreserveLCSSA*/ false); if (!Changed) return PreservedAnalyses::all(); @@ -853,6 +861,8 @@ PreservedAnalyses LoopSimplifyPass::run(Function &F, PA.preserve<SCEVAA>(); PA.preserve<ScalarEvolutionAnalysis>(); PA.preserve<DependenceAnalysis>(); + if (MSSAAnalysis) + PA.preserve<MemorySSAAnalysis>(); // BPI maps conditional terminators to probabilities, LoopSimplify can insert // blocks, but it does so only by splitting existing blocks and edges. This // results in the interesting property that all new terminators inserted are diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 4a1edb3700c0..4b94b371e70a 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -22,17 +22,18 @@ #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SimplifyIndVar.h" @@ -870,7 +871,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, assert(!DT || !UnrollVerifyDomtree || DT->verify(DominatorTree::VerificationLevel::Fast)); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Merge adjacent basic blocks, if possible. for (BasicBlock *Latch : Latches) { BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator()); @@ -890,6 +891,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, } } } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. We now simplify the unrolled loop, // doing constant propagation and dead code elimination as we go. 
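The UnrollLoop hunk above switches the DomTreeUpdater from the Eager to the Lazy strategy, so the CFG edge updates produced while merging latch blocks are queued and applied in one batch; the final DT = &DTU.getDomTree() call is what flushes them. A minimal sketch of that idiom, with the wrapper function and its simplified merge condition invented for illustration:

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Sketch of the Lazy DomTreeUpdater pattern: queue updates during the
// transform, then apply them all at once when the tree is requested.
static void mergeLatchesSketch(SmallVectorImpl<BasicBlock *> &Latches,
                               DominatorTree *&DT) {
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  for (BasicBlock *Latch : Latches)
    if (auto *Term = dyn_cast<BranchInst>(Latch->getTerminator()))
      if (Term->isUnconditional())
        // Records its edge deletions/insertions in DTU instead of
        // recalculating the dominator tree after every merge.
        MergeBlockIntoPredecessor(Term->getSuccessor(0), &DTU);
  DT = &DTU.getDomTree(); // applies all pending updates, returns a valid tree
}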
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp index ff49d83f25c5..f1965934b2d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp @@ -21,7 +21,6 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" @@ -177,6 +176,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( // When we enter here we should have already checked that it is safe BasicBlock *Header = L->getHeader(); + assert(Header && "No header."); assert(L->getSubLoops().size() == 1); Loop *SubLoop = *L->begin(); @@ -247,8 +247,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop( BasicBlock *Preheader = L->getLoopPreheader(); BasicBlock *LatchBlock = L->getLoopLatch(); + assert(Preheader && "No preheader"); + assert(LatchBlock && "No latch block"); BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator()); - assert(Preheader && LatchBlock && Header); assert(BI && !BI->isUnconditional()); bool ContinueOnTrue = L->contains(BI->getSuccessor(0)); BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue); @@ -517,6 +518,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]); } + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the // new ones required. if (Count != 1) { @@ -530,7 +532,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop( ForeBlocksLast.back(), SubLoopBlocksFirst[0]); DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert, SubLoopBlocksLast.back(), AftBlocksFirst[0]); - DT->applyUpdates(DTUpdates); + DTU.applyUpdatesPermissive(DTUpdates); } // Merge adjacent basic blocks, if possible. @@ -538,7 +540,6 @@ LoopUnrollResult llvm::UnrollAndJamLoop( MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end()); MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end()); MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end()); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); while (!MergeBlocks.empty()) { BasicBlock *BB = *MergeBlocks.begin(); BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()); @@ -555,6 +556,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop( } else MergeBlocks.erase(BB); } + // Apply updates to the DomTree. + DT = &DTU.getDomTree(); // At this point, the code is well formed. 
We now do a quick sweep over the
 // inserted code, doing constant propagation and dead code elimination as we
 // go.
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 005306cf1898..7a168ff6f32b 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -62,9 +62,11 @@ static cl::opt<unsigned> UnrollForcePeelCount(
     cl::desc("Force a peel count regardless of profiling information."));
 
 static cl::opt<bool> UnrollPeelMultiDeoptExit(
-    "unroll-peel-multi-deopt-exit", cl::init(false), cl::Hidden,
+    "unroll-peel-multi-deopt-exit", cl::init(true), cl::Hidden,
     cl::desc("Allow peeling of loops with multiple deopt exits."));
 
+static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
+
 // Designates that a Phi is estimated to become invariant after an "infinite"
 // number of loop iterations (i.e. only may become an invariant if the loop is
 // fully unrolled).
@@ -210,14 +212,11 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
     const SCEVAddRecExpr *LeftAR = cast<SCEVAddRecExpr>(LeftSCEV);
 
     // Avoid huge SCEV computations in the loop below, make sure we only
-    // consider AddRecs of the loop we are trying to peel and avoid
-    // non-monotonic predicates, as we will not be able to simplify the loop
-    // body.
-    // FIXME: For the non-monotonic predicates ICMP_EQ and ICMP_NE we can
-    //        simplify the loop, if we peel 1 additional iteration, if there
-    //        is no wrapping.
+    // consider AddRecs of the loop we are trying to peel.
+    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L)
+      continue;
     bool Increasing;
-    if (!LeftAR->isAffine() || LeftAR->getLoop() != &L ||
+    if (!(ICmpInst::isEquality(Pred) && LeftAR->hasNoSelfWrap()) &&
         !SE.isMonotonicPredicate(LeftAR, Pred, Increasing))
       continue;
     (void)Increasing;
@@ -236,18 +235,43 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
       Pred = ICmpInst::getInversePredicate(Pred);
 
     const SCEV *Step = LeftAR->getStepRecurrence(SE);
-    while (NewPeelCount < MaxPeelCount &&
-           SE.isKnownPredicate(Pred, IterVal, RightSCEV)) {
-      IterVal = SE.getAddExpr(IterVal, Step);
+    const SCEV *NextIterVal = SE.getAddExpr(IterVal, Step);
+    auto PeelOneMoreIteration = [&IterVal, &NextIterVal, &SE, Step,
+                                 &NewPeelCount]() {
+      IterVal = NextIterVal;
+      NextIterVal = SE.getAddExpr(IterVal, Step);
       NewPeelCount++;
+    };
+
+    auto CanPeelOneMoreIteration = [&NewPeelCount, &MaxPeelCount]() {
+      return NewPeelCount < MaxPeelCount;
+    };
+
+    while (CanPeelOneMoreIteration() &&
+           SE.isKnownPredicate(Pred, IterVal, RightSCEV))
+      PeelOneMoreIteration();
+
+    // With *that* peel count, does the predicate !Pred become known in the
+    // first iteration of the loop body after peeling?
+    if (!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
+                             RightSCEV))
+      continue; // If not, give up.
+
+    // However, for equality comparisons, that isn't always sufficient to
+    // eliminate the comparison in the loop body; we may need to peel one more
+    // iteration. See if that makes !Pred become unknown again.
+    if (ICmpInst::isEquality(Pred) &&
+        !SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
+                             RightSCEV)) {
+      assert(!SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
+             SE.isKnownPredicate(Pred, NextIterVal, RightSCEV) &&
+             "Expected Pred to go from known to unknown.");
+      if (!CanPeelOneMoreIteration())
+        continue; // Need to peel one more iteration, but can't. Give up.
+      PeelOneMoreIteration(); // Great!
     }
 
-    // Only peel the loop if the monotonic predicate !Pred becomes known in the
-    // first iteration of the loop body after peeling.
-    if (NewPeelCount > DesiredPeelCount &&
-        SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), IterVal,
-                            RightSCEV))
-      DesiredPeelCount = NewPeelCount;
+    DesiredPeelCount = std::max(DesiredPeelCount, NewPeelCount);
   }
 
   return DesiredPeelCount;
@@ -275,6 +299,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
     LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
                       << " iterations.\n");
     UP.PeelCount = UnrollForcePeelCount;
+    UP.PeelProfiledIterations = true;
     return;
   }
 
@@ -282,6 +307,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (!UP.AllowPeeling)
     return;
 
+  unsigned AlreadyPeeled = 0;
+  if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
+    AlreadyPeeled = *Peeled;
+  // Stop if we already peeled off the maximum number of iterations.
+  if (AlreadyPeeled >= UnrollPeelMaxCount)
+    return;
+
   // Here we try to get rid of Phis which become invariants after 1, 2, ..., N
   // iterations of the loop. For this we compute the number of iterations after
   // which every Phi is guaranteed to become an invariant, and try to peel the
@@ -317,11 +349,14 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
       DesiredPeelCount = std::min(DesiredPeelCount, MaxPeelCount);
       // Consider max peel count limitation.
       assert(DesiredPeelCount > 0 && "Wrong loop size estimation?");
-      LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
-                        << " iteration(s) to turn"
-                        << " some Phis into invariants.\n");
-      UP.PeelCount = DesiredPeelCount;
-      return;
+      if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
+        LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
+                          << " iteration(s) to turn"
+                          << " some Phis into invariants.\n");
+        UP.PeelCount = DesiredPeelCount;
+        UP.PeelProfiledIterations = false;
+        return;
+      }
     }
   }
 
@@ -330,6 +365,9 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (TripCount)
     return;
 
+  // Do not apply profile-based peeling if it is disabled.
+  if (!UP.PeelProfiledIterations)
+    return;
   // If we don't know the trip count, but have reason to believe the average
   // trip count is low, peeling should be beneficial, since we will usually
   // hit the peeled section.
@@ -344,7 +382,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
                       << "\n");
 
       if (*PeelCount) {
-        if ((*PeelCount <= UnrollPeelMaxCount) &&
+        if ((*PeelCount + AlreadyPeeled <= UnrollPeelMaxCount) &&
             (LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
           LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
                             << " iterations.\n");
@@ -352,6 +390,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
           return;
         }
         LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
+        LLVM_DEBUG(dbgs() << "Already peeled count: " << AlreadyPeeled << "\n");
         LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
         LLVM_DEBUG(dbgs() << "Peel cost: " << LoopSize * (*PeelCount + 1)
                           << "\n");
@@ -364,88 +403,77 @@
 /// iteration.
 /// This sets the branch weights for the latch of the recently peeled off loop
 /// iteration correctly.
-/// Our goal is to make sure that:
-/// a) The total weight of all the copies of the loop body is preserved.
-/// b) The total weight of the loop exit is preserved.
-/// c) The body weight is reasonably distributed between the peeled iterations.
+/// Let F be the weight of the edge from the latch to the header.
+/// Let E be the weight of the edge from the latch to the exit.
+/// F/(F+E) is the probability of taking the backedge and E/(F+E) is the
+/// probability of exiting.
+/// Then, the estimated TripCount = F / E.
+/// For the I-th (counting from 0) peeled-off iteration we set the weights for
+/// the peeled latch to (TC - I, 1). This gives a reasonable distribution:
+/// the probability of exiting, 1/(TC-I), increases, while at the same time
+/// the estimated trip count of the remaining loop decreases by I.
+/// To avoid dealing with division rounding we can simply multiply both parts
+/// of the weights by E and use the weights (F - I * E, E).
 ///
 /// \param Header The copy of the header block that belongs to the next
 /// iteration.
 /// \param LatchBR The copy of the latch branch that belongs to this iteration.
-/// \param IterNumber The serial number of the iteration that was just
-/// peeled off.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[in,out] PeeledHeaderWeight The total number of dynamic loop
-/// iterations that are unaccounted for. As an input, it represents the number
-/// of times we expect to enter the header of the iteration currently being
-/// peeled off. The output is the number of times we expect to enter the
-/// header of the next iteration.
+/// \param[in,out] FallThroughWeight The weight of the edge from latch to
+/// header before peeling (in) and after peeling off one iteration (out).
 static void updateBranchWeights(BasicBlock *Header, BranchInst *LatchBR,
-                                unsigned IterNumber, unsigned AvgIters,
-                                uint64_t &PeeledHeaderWeight) {
-  if (!PeeledHeaderWeight)
+                                uint64_t ExitWeight,
+                                uint64_t &FallThroughWeight) {
+  // A FallThroughWeight of 0 means that there are no branch weights on the
+  // original latch block or the estimated trip count is zero.
+  if (!FallThroughWeight)
     return;
-  // FIXME: Pick a more realistic distribution.
-  // Currently the proportion of weight we assign to the fall-through
-  // side of the branch drops linearly with the iteration number, and we use
-  // a 0.9 fudge factor to make the drop-off less sharp...
-  uint64_t FallThruWeight =
-      PeeledHeaderWeight * ((float)(AvgIters - IterNumber) / AvgIters * 0.9);
-  uint64_t ExitWeight = PeeledHeaderWeight - FallThruWeight;
-  PeeledHeaderWeight -= ExitWeight;
 
   unsigned HeaderIdx = (LatchBR->getSuccessor(0) == Header ? 0 : 1);
   MDBuilder MDB(LatchBR->getContext());
   MDNode *WeightNode =
-      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThruWeight)
-                : MDB.createBranchWeights(FallThruWeight, ExitWeight);
+      HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight)
                : MDB.createBranchWeights(FallThroughWeight, ExitWeight);
   LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode);
+  FallThroughWeight =
+      FallThroughWeight > ExitWeight ? FallThroughWeight - ExitWeight : 1;
 }
 
 /// Initialize the weights.
 ///
 /// \param Header The header block.
 /// \param LatchBR The latch branch.
-/// \param AvgIters The average number of iterations we expect the loop to have.
-/// \param[out] ExitWeight The # of times the edge from Latch to Exit is taken.
-/// \param[out] CurHeaderWeight The # of times the header is executed.
+/// \param[out] ExitWeight The weight of the edge from Latch to Exit. +/// \param[out] FallThroughWeight The weight of the edge from Latch to Header. static void initBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - unsigned AvgIters, uint64_t &ExitWeight, - uint64_t &CurHeaderWeight) { + uint64_t &ExitWeight, + uint64_t &FallThroughWeight) { uint64_t TrueWeight, FalseWeight; if (!LatchBR->extractProfMetadata(TrueWeight, FalseWeight)) return; unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; ExitWeight = HeaderIdx ? TrueWeight : FalseWeight; - // The # of times the loop body executes is the sum of the exit block - // is taken and the # of times the backedges are taken. - CurHeaderWeight = TrueWeight + FalseWeight; + FallThroughWeight = HeaderIdx ? FalseWeight : TrueWeight; } /// Update the weights of original Latch block after peeling off all iterations. /// /// \param Header The header block. /// \param LatchBR The latch branch. -/// \param ExitWeight The weight of the edge from Latch to Exit block. -/// \param CurHeaderWeight The # of time the header is executed. +/// \param ExitWeight The weight of the edge from Latch to Exit. +/// \param FallThroughWeight The weight of the edge from Latch to Header. static void fixupBranchWeights(BasicBlock *Header, BranchInst *LatchBR, - uint64_t ExitWeight, uint64_t CurHeaderWeight) { - // Adjust the branch weights on the loop exit. - if (!ExitWeight) + uint64_t ExitWeight, + uint64_t FallThroughWeight) { + // FallThroughWeight is 0 means that there is no branch weights on original + // latch block or estimated trip count is zero. + if (!FallThroughWeight) return; - // The backedge count is the difference of current header weight and - // current loop exit weight. If the current header weight is smaller than - // the current loop exit weight, we mark the loop backedge weight as 1. - uint64_t BackEdgeWeight = 0; - if (ExitWeight < CurHeaderWeight) - BackEdgeWeight = CurHeaderWeight - ExitWeight; - else - BackEdgeWeight = 1; + // Sets the branch weights on the loop exit. MDBuilder MDB(LatchBR->getContext()); unsigned HeaderIdx = LatchBR->getSuccessor(0) == Header ? 0 : 1; MDNode *WeightNode = - HeaderIdx ? MDB.createBranchWeights(ExitWeight, BackEdgeWeight) - : MDB.createBranchWeights(BackEdgeWeight, ExitWeight); + HeaderIdx ? MDB.createBranchWeights(ExitWeight, FallThroughWeight) + : MDB.createBranchWeights(FallThroughWeight, ExitWeight); LatchBR->setMetadata(LLVMContext::MD_prof, WeightNode); } @@ -556,7 +584,7 @@ static void cloneLoopBlocks( // LastValueMap is updated with the values for the current loop // which are used the next time this function is called. - for (const auto &KV : VMap) + for (auto KV : VMap) LVMap[KV.first] = KV.second; } @@ -586,11 +614,30 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, DenseMap<BasicBlock *, BasicBlock *> ExitIDom; if (DT) { + // We'd like to determine the idom of exit block after peeling one + // iteration. + // Let Exit is exit block. + // Let ExitingSet - is a set of predecessors of Exit block. They are exiting + // blocks. + // Let Latch' and ExitingSet' are copies after a peeling. + // We'd like to find an idom'(Exit) - idom of Exit after peeling. + // It is an evident that idom'(Exit) will be the nearest common dominator + // of ExitingSet and ExitingSet'. + // idom(Exit) is a nearest common dominator of ExitingSet. + // idom(Exit)' is a nearest common dominator of ExitingSet'. 
+ // Taking into account that we have a single Latch, Latch' will dominate + // Header and idom(Exit). + // So the idom'(Exit) is nearest common dominator of idom(Exit)' and Latch'. + // All these basic blocks are in the same loop, so what we find is + // (nearest common dominator of idom(Exit) and Latch)'. + // In the loop below we remember nearest common dominator of idom(Exit) and + // Latch to update idom of Exit later. assert(L->hasDedicatedExits() && "No dedicated exits?"); for (auto Edge : ExitEdges) { if (ExitIDom.count(Edge.second)) continue; - BasicBlock *BB = DT->getNode(Edge.second)->getIDom()->getBlock(); + BasicBlock *BB = DT->findNearestCommonDominator( + DT->getNode(Edge.second)->getIDom()->getBlock(), Latch); assert(L->contains(BB) && "IDom is not in a loop"); ExitIDom[Edge.second] = BB; } @@ -659,23 +706,14 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, // newly created branches. BranchInst *LatchBR = cast<BranchInst>(cast<BasicBlock>(Latch)->getTerminator()); - uint64_t ExitWeight = 0, CurHeaderWeight = 0; - initBranchWeights(Header, LatchBR, PeelCount, ExitWeight, CurHeaderWeight); + uint64_t ExitWeight = 0, FallThroughWeight = 0; + initBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); // For each peeled-off iteration, make a copy of the loop. for (unsigned Iter = 0; Iter < PeelCount; ++Iter) { SmallVector<BasicBlock *, 8> NewBlocks; ValueToValueMapTy VMap; - // Subtract the exit weight from the current header weight -- the exit - // weight is exactly the weight of the previous iteration's header. - // FIXME: due to the way the distribution is constructed, we need a - // guard here to make sure we don't end up with non-positive weights. - if (ExitWeight < CurHeaderWeight) - CurHeaderWeight -= ExitWeight; - else - CurHeaderWeight = 1; - cloneLoopBlocks(L, Iter, InsertTop, InsertBot, ExitEdges, NewBlocks, LoopBlocks, VMap, LVMap, DT, LI); @@ -697,8 +735,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, } auto *LatchBRCopy = cast<BranchInst>(VMap[LatchBR]); - updateBranchWeights(InsertBot, LatchBRCopy, Iter, - PeelCount, ExitWeight); + updateBranchWeights(InsertBot, LatchBRCopy, ExitWeight, FallThroughWeight); // Remove Loop metadata from the latch branch instruction // because it is not the Loop's latch branch anymore. LatchBRCopy->setMetadata(LLVMContext::MD_loop, nullptr); @@ -724,7 +761,13 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, PHI->setIncomingValueForBlock(NewPreHeader, NewVal); } - fixupBranchWeights(Header, LatchBR, ExitWeight, CurHeaderWeight); + fixupBranchWeights(Header, LatchBR, ExitWeight, FallThroughWeight); + + // Update Metadata for count of peeled off iterations. 
+ unsigned AlreadyPeeled = 0; + if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData)) + AlreadyPeeled = *Peeled; + addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount); if (Loop *ParentLoop = L->getParentLoop()) L = ParentLoop; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index d22fdb4d52dc..ddb7479924bd 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils.h" @@ -395,9 +396,9 @@ CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop, } } if (CreateRemainderLoop) { - Loop *NewLoop = NewLoops[L]; - MDNode *LoopID = NewLoop->getLoopID(); + Loop *NewLoop = NewLoops[L]; assert(NewLoop && "L should have been cloned"); + MDNode *LoopID = NewLoop->getLoopID(); // Only add loop metadata if the loop is not going to be completely // unrolled. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp index ec226e65f650..c4c40189fda4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -19,11 +19,11 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -34,6 +34,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" @@ -45,6 +46,7 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "loop-utils" static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced"; +static const char *LLVMLoopDisableLICM = "llvm.licm.disable"; bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, @@ -169,6 +171,8 @@ void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) { AU.addPreserved<SCEVAAWrapperPass>(); AU.addRequired<ScalarEvolutionWrapperPass>(); AU.addPreserved<ScalarEvolutionWrapperPass>(); + // FIXME: When all loop passes preserve MemorySSA, it can be required and + // preserved here instead of the individual handling in each pass. } /// Manually defined generic "LoopPass" dependency initialization. This is used @@ -189,6 +193,54 @@ void llvm::initializeLoopPassPass(PassRegistry &Registry) { INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) + INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +} + +/// Create MDNode for input string. 
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = { + MDString::get(Context, Name), + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); +} + +/// Set input string into loop metadata by keeping other values intact. +/// If the string is already in loop metadata update value if it is +/// different. +void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD, + unsigned V) { + SmallVector<Metadata *, 4> MDs(1); + // If the loop already has metadata, retain it. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast<MDNode>(LoopID->getOperand(i)); + // If it is of form key = value, try to parse it. + if (Node->getNumOperands() == 2) { + MDString *S = dyn_cast<MDString>(Node->getOperand(0)); + if (S && S->getString().equals(StringMD)) { + ConstantInt *IntMD = + mdconst::extract_or_null<ConstantInt>(Node->getOperand(1)); + if (IntMD && IntMD->getSExtValue() == V) + // It is already in place. Do nothing. + return; + // We need to update the value, so just skip it here and it will + // be added after copying other existed nodes. + continue; + } + } + MDs.push_back(Node); + } + } + // Add new metadata. + MDs.push_back(createStringMetadata(TheLoop, StringMD, V)); + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + TheLoop->setLoopID(NewLoopID); } /// Find string metadata for loop @@ -332,6 +384,10 @@ bool llvm::hasDisableAllTransformsHint(const Loop *L) { return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced); } +bool llvm::hasDisableLICMTransformsHint(const Loop *L) { + return getBooleanLoopAttribute(L, LLVMLoopDisableLICM); +} + TransformationMode llvm::hasUnrollTransformation(Loop *L) { if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable")) return TM_SuppressedByUser; @@ -616,7 +672,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, LI->removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. - LI->erase(L); + // Note: LoopInfo::erase remove the given loop and relink its subloops with + // its parent. While removeLoop/removeChildLoop remove the given loop but + // not relink its subloops, which is what we want. + if (Loop *ParentLoop = L->getParentLoop()) { + Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L); + assert(I != ParentLoop->end() && "Couldn't find loop"); + ParentLoop->removeChildLoop(I); + } else { + Loop::iterator I = find(LI->begin(), LI->end(), L); + assert(I != LI->end() && "Couldn't find loop"); + LI->removeLoop(I); + } + LI->destroy(L); } } @@ -646,19 +714,19 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. 
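A simplified, standalone model of what addStringMetadataToLoop above maintains may help: loop metadata behaves like a list of (string key, i32 value) pairs that is kept up to date. The key name here is only an example, and the model glosses over the self-referencing operand 0 and the fact that a changed node is re-appended rather than rewritten in place.

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

using LoopMD = std::vector<std::pair<std::string, unsigned>>;

// Set Key=V, keeping other entries intact; do nothing if the value already matches.
static void setKey(LoopMD &MD, const std::string &Key, unsigned V) {
  for (auto &KV : MD) {
    if (KV.first == Key) {
      if (KV.second != V)
        KV.second = V;
      return;
    }
  }
  MD.emplace_back(Key, V);
}

int main() {
  LoopMD MD;
  setKey(MD, "llvm.loop.peeled.count", 2); // appended on first use
  setKey(MD, "llvm.loop.peeled.count", 4); // value refreshed after peeling more
  for (const auto &KV : MD)
    std::printf("!{\"%s\", i32 %u}\n", KV.first.c_str(), KV.second);
  return 0;
}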
- uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackedgeTakenWeight, LatchExitWeight; + if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!BackedgeTakenWeight || !LatchExitWeight) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest. - if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp index a9a480a4b7f9..50752bd78a65 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -18,6 +18,8 @@ #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -92,8 +94,8 @@ void LoopVersioning::versionLoop( // Create empty preheader for the loop (and after cloning for the // non-versioned loop). BasicBlock *PH = - SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI); - PH->setName(VersionedLoop->getHeader()->getName() + ".ph"); + SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI, + nullptr, VersionedLoop->getHeader()->getName() + ".ph"); // Clone the loop including the preheader. 
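The reworked trip-count estimation in getLoopEstimatedTripCount above reduces to a rounded division of the backedge weight by the latch-exit weight; a minimal standalone model, with a locally defined divideNearest and made-up weights, looks like this.

#include <cstdint>
#include <cstdio>
#include <utility>

// Same rounding behaviour as llvm::divideNearest: divide, rounding to nearest.
static uint64_t divideNearest(uint64_t Num, uint64_t Den) {
  return (Num + Den / 2) / Den;
}

int main() {
  uint64_t TrueWeight = 7, FalseWeight = 2000;
  bool Succ0IsHeader = false; // here successor(0) is the exit edge
  uint64_t BackedgeTakenWeight = TrueWeight, LatchExitWeight = FalseWeight;
  if (!Succ0IsHeader) // normalise so the first weight is the backedge weight
    std::swap(BackedgeTakenWeight, LatchExitWeight);
  if (!BackedgeTakenWeight || !LatchExitWeight) {
    std::puts("estimated trip count: 0");
    return 0;
  }
  std::printf("estimated trip count: %llu\n",
              (unsigned long long)divideNearest(BackedgeTakenWeight,
                                                LatchExitWeight));
  return 0;
}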
// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp index fe67e191dc62..1af0ce3d86cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerInvoke.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp index 8256e3b5f5af..4b9d0dadfc17 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp index cd2c81b6abc8..5ad7aeb463ec 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -19,6 +19,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Utils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp index c0b7edc547fd..7f961dbaf4b4 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MetaRenamer.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/TypeFinder.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" @@ -121,15 +122,14 @@ namespace { } // Rename all functions - const TargetLibraryInfo &TLI = - getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); for (auto &F : M) { StringRef Name = F.getName(); LibFunc Tmp; // Leave library functions alone because their presence or absence could // affect the behavior of other passes. if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) || - TLI.getLibFunc(F, Tmp)) + getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F).getLibFunc( + F, Tmp)) continue; // Leave @main alone. The output of -metarenamer might be passed to diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp new file mode 100644 index 000000000000..a16ca1fb8efa --- /dev/null +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/MisExpect.cpp @@ -0,0 +1,178 @@ +//===--- MisExpect.cpp - Check the use of llvm.expect with PGO data -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit warnings for potentially incorrect usage of the +// llvm.expect intrinsic. 
This utility extracts the threshold values from
+// metadata associated with the instrumented Branch or Switch instruction. The
+// threshold values are then used to determine if a warning should be emitted.
+//
+// MisExpect metadata is generated when llvm.expect intrinsics are lowered; see
+// LowerExpectIntrinsic.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/MisExpect.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <cstdint>
+#include <functional>
+#include <numeric>
+
+#define DEBUG_TYPE "misexpect"
+
+using namespace llvm;
+using namespace misexpect;
+
+namespace llvm {
+
+// Command line option to enable/disable the warning when profile data suggests
+// a mismatch with the use of the llvm.expect intrinsic
+static cl::opt<bool> PGOWarnMisExpect(
+    "pgo-warn-misexpect", cl::init(false), cl::Hidden,
+    cl::desc("Use this option to turn on/off "
+             "warnings about incorrect usage of llvm.expect intrinsics."));
+
+} // namespace llvm
+
+namespace {
+
+Instruction *getOprndOrInst(Instruction *I) {
+  assert(I != nullptr && "MisExpect target Instruction cannot be nullptr");
+  Instruction *Ret = nullptr;
+  if (auto *B = dyn_cast<BranchInst>(I)) {
+    Ret = dyn_cast<Instruction>(B->getCondition());
+  }
+  // TODO: Find a way to resolve condition location for switches
+  // Using the condition of the switch seems to often resolve to an earlier
+  // point in the program, i.e. the calculation of the switch condition, rather
+  // than the switch's location in the source code. Thus, we should use the
+  // instruction to get source code locations rather than the condition to
+  // improve diagnostic output, such as the caret. If the same problem exists
+  // for branch instructions, then we should remove this function and directly
+  // use the instruction.
+  //
+  // else if (auto S = dyn_cast<SwitchInst>(I)) {
+  //   Ret = I;
+  //}
+  return Ret ? 
Ret : I; +} + +void emitMisexpectDiagnostic(Instruction *I, LLVMContext &Ctx, + uint64_t ProfCount, uint64_t TotalCount) { + double PercentageCorrect = (double)ProfCount / TotalCount; + auto PerString = + formatv("{0:P} ({1} / {2})", PercentageCorrect, ProfCount, TotalCount); + auto RemStr = formatv( + "Potential performance regression from use of the llvm.expect intrinsic: " + "Annotation was correct on {0} of profiled executions.", + PerString); + Twine Msg(PerString); + Instruction *Cond = getOprndOrInst(I); + if (PGOWarnMisExpect) + Ctx.diagnose(DiagnosticInfoMisExpect(Cond, Msg)); + OptimizationRemarkEmitter ORE(I->getParent()->getParent()); + ORE.emit(OptimizationRemark(DEBUG_TYPE, "misexpect", Cond) << RemStr.str()); +} + +} // namespace + +namespace llvm { +namespace misexpect { + +void verifyMisExpect(Instruction *I, const SmallVector<uint32_t, 4> &Weights, + LLVMContext &Ctx) { + if (auto *MisExpectData = I->getMetadata(LLVMContext::MD_misexpect)) { + auto *MisExpectDataName = dyn_cast<MDString>(MisExpectData->getOperand(0)); + if (MisExpectDataName && + MisExpectDataName->getString().equals("misexpect")) { + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + LLVM_DEBUG(llvm::dbgs() + << "Function: " << I->getFunction()->getName() << "\n"); + LLVM_DEBUG(llvm::dbgs() << "Instruction: " << *I << ":\n"); + LLVM_DEBUG(for (int Idx = 0, Size = Weights.size(); Idx < Size; ++Idx) { + llvm::dbgs() << "Weights[" << Idx << "] = " << Weights[Idx] << "\n"; + }); + + // extract values from misexpect metadata + const auto *IndexCint = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(1)); + const auto *LikelyCInt = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(2)); + const auto *UnlikelyCInt = + mdconst::dyn_extract<ConstantInt>(MisExpectData->getOperand(3)); + + if (!IndexCint || !LikelyCInt || !UnlikelyCInt) + return; + + const uint64_t Index = IndexCint->getZExtValue(); + const uint64_t LikelyBranchWeight = LikelyCInt->getZExtValue(); + const uint64_t UnlikelyBranchWeight = UnlikelyCInt->getZExtValue(); + const uint64_t ProfileCount = Weights[Index]; + const uint64_t CaseTotal = std::accumulate( + Weights.begin(), Weights.end(), (uint64_t)0, std::plus<uint64_t>()); + const uint64_t NumUnlikelyTargets = Weights.size() - 1; + + const uint64_t TotalBranchWeight = + LikelyBranchWeight + (UnlikelyBranchWeight * NumUnlikelyTargets); + + const llvm::BranchProbability LikelyThreshold(LikelyBranchWeight, + TotalBranchWeight); + uint64_t ScaledThreshold = LikelyThreshold.scale(CaseTotal); + + LLVM_DEBUG(llvm::dbgs() + << "Unlikely Targets: " << NumUnlikelyTargets << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "Profile Count: " << ProfileCount << ":\n"); + LLVM_DEBUG(llvm::dbgs() + << "Scaled Threshold: " << ScaledThreshold << ":\n"); + LLVM_DEBUG(llvm::dbgs() << "------------------\n"); + if (ProfileCount < ScaledThreshold) + emitMisexpectDiagnostic(I, Ctx, ProfileCount, CaseTotal); + } + } +} + +void checkFrontendInstrumentation(Instruction &I) { + if (auto *MD = I.getMetadata(LLVMContext::MD_prof)) { + unsigned NOps = MD->getNumOperands(); + + // Only emit misexpect diagnostics if at least 2 branch weights are present. + // Less than 2 branch weights means that the profiling metadata is: + // 1) incorrect/corrupted + // 2) not branch weight metadata + // 3) completely deterministic + // In these cases we should not emit any diagnostic related to misexpect. 
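The threshold check performed by verifyMisExpect above can be sketched with plain integer arithmetic. This is only a model: the likely/unlikely weights of 2000 and 1 are assumed defaults for a lowered llvm.expect, and BranchProbability::scale is approximated by a single multiply-and-divide.

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const uint64_t LikelyBranchWeight = 2000, UnlikelyBranchWeight = 1; // assumed defaults
  std::vector<uint64_t> Weights = {10, 990}; // profile weights; index 0 was annotated likely
  const uint64_t Index = 0;

  const uint64_t ProfileCount = Weights[Index];
  const uint64_t CaseTotal =
      std::accumulate(Weights.begin(), Weights.end(), uint64_t(0));
  const uint64_t NumUnlikelyTargets = Weights.size() - 1;
  const uint64_t TotalBranchWeight =
      LikelyBranchWeight + UnlikelyBranchWeight * NumUnlikelyTargets;
  // Approximation of BranchProbability(Likely, Total).scale(CaseTotal).
  const uint64_t ScaledThreshold =
      CaseTotal * LikelyBranchWeight / TotalBranchWeight;

  if (ProfileCount < ScaledThreshold)
    std::printf("misexpect: likely target hit %llu of %llu executions, "
                "threshold was %llu\n",
                (unsigned long long)ProfileCount,
                (unsigned long long)CaseTotal,
                (unsigned long long)ScaledThreshold);
  return 0;
}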
+ if (NOps < 3) + return; + + // Operand 0 is a string tag "branch_weights" + if (MDString *Tag = cast<MDString>(MD->getOperand(0))) { + if (Tag->getString().equals("branch_weights")) { + SmallVector<uint32_t, 4> RealWeights(NOps - 1); + for (unsigned i = 1; i < NOps; i++) { + ConstantInt *Value = + mdconst::dyn_extract<ConstantInt>(MD->getOperand(i)); + RealWeights[i - 1] = Value->getZExtValue(); + } + verifyMisExpect(&I, RealWeights, I.getContext()); + } + } + } +} + +} // namespace misexpect +} // namespace llvm +#undef DEBUG_TYPE diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp index c84beceee191..b94f57e4dc2c 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" @@ -73,7 +74,7 @@ static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> SmallPtrSet<Constant *, 16> InitAsSet; SmallVector<Constant *, 16> Init; if (GV) { - ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer()); + auto *CA = cast<ConstantArray>(GV->getInitializer()); for (auto &Op : CA->operands()) { Constant *C = cast_or_null<Constant>(Op); if (InitAsSet.insert(C).second) @@ -280,3 +281,31 @@ std::string llvm::getUniqueModuleId(Module *M) { MD5::stringifyResult(R, Str); return ("$" + Str).str(); } + +void VFABI::setVectorVariantNames( + CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) { + if (VariantMappings.empty()) + return; + + SmallString<256> Buffer; + llvm::raw_svector_ostream Out(Buffer); + for (const std::string &VariantMapping : VariantMappings) + Out << VariantMapping << ","; + // Get rid of the trailing ','. 
+ assert(!Buffer.str().empty() && "Must have at least one char."); + Buffer.pop_back(); + + Module *M = CI->getModule(); +#ifndef NDEBUG + for (const std::string &VariantMapping : VariantMappings) { + Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping); + assert(VI.hasValue() && "Canno add an invalid VFABI name."); + assert(M->getNamedValue(VI.getValue().VectorName) && + "Cannot add variant to attribute: " + "vector function declaration is missing."); + } +#endif + CI->addAttribute( + AttributeList::FunctionIndex, + Attribute::get(M->getContext(), MappingsAttrName, Buffer.str())); +} diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp index ac8991e9d475..1c5c41abc682 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp @@ -12,9 +12,9 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/NameAnonGlobals.h" - #include "llvm/ADT/SmallString.h" #include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/MD5.h" #include "llvm/Transforms/Utils/ModuleUtils.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp index bdf24d80bd17..dda2867f44b2 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Support/FormattedStream.h" @@ -125,8 +126,10 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A, // necessary to compare uses/defs in the same block. Doing so allows us to walk // the minimum number of instructions necessary to compute our def/use ordering. struct ValueDFS_Compare { + DominatorTree &DT; OrderedInstructions &OI; - ValueDFS_Compare(OrderedInstructions &OI) : OI(OI) {} + ValueDFS_Compare(DominatorTree &DT, OrderedInstructions &OI) + : DT(DT), OI(OI) {} bool operator()(const ValueDFS &A, const ValueDFS &B) const { if (&A == &B) @@ -136,7 +139,9 @@ struct ValueDFS_Compare { // comesbefore to see what the real ordering is, because they are in the // same basic block. - bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut); + assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) && + "Equal DFS-in numbers imply equal out numbers"); + bool SameBlock = A.DFSIn == B.DFSIn; // We want to put the def that will get used for a given set of phi uses, // before those phi uses. @@ -145,9 +150,11 @@ struct ValueDFS_Compare { if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last) return comparePHIRelated(A, B); + bool isADef = A.Def; + bool isBDef = B.Def; if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle) - return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) < - std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U); + return std::tie(A.DFSIn, A.LocalNum, isADef) < + std::tie(B.DFSIn, B.LocalNum, isBDef); return localComesBefore(A, B); } @@ -164,10 +171,35 @@ struct ValueDFS_Compare { // For two phi related values, return the ordering. 
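For the VFABI change earlier in this hunk, the attribute value built by setVectorVariantNames is simply the variant mappings joined with commas; the sketch below reproduces that string building outside of LLVM, and both the attribute name shown in the comment and the mangled names are illustrative assumptions.

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Hypothetical VFABI mangled names mapping a scalar foo to vector variants.
  std::vector<std::string> VariantMappings = {"_ZGV_LLVM_N4v_foo(foo_vec4)",
                                              "_ZGV_LLVM_N8v_foo(foo_vec8)"};
  std::string Buffer;
  for (const std::string &M : VariantMappings)
    Buffer += M + ",";
  if (!Buffer.empty())
    Buffer.pop_back(); // get rid of the trailing ','
  // The call site would then carry something like:
  //   "vector-function-abi-variant"="<Buffer>"   (attribute name assumed)
  std::printf("%s\n", Buffer.c_str());
  return 0;
}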
bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const { - auto &ABlockEdge = getBlockEdge(A); - auto &BBlockEdge = getBlockEdge(B); - // Now sort by block edge and then defs before uses. - return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U); + BasicBlock *ASrc, *ADest, *BSrc, *BDest; + std::tie(ASrc, ADest) = getBlockEdge(A); + std::tie(BSrc, BDest) = getBlockEdge(B); + +#ifndef NDEBUG + // This function should only be used for values in the same BB, check that. + DomTreeNode *DomASrc = DT.getNode(ASrc); + DomTreeNode *DomBSrc = DT.getNode(BSrc); + assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn && + "DFS numbers for A should match the ones of the source block"); + assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn && + "DFS numbers for B should match the ones of the source block"); + assert(A.DFSIn == B.DFSIn && "Values must be in the same block"); +#endif + (void)ASrc; + (void)BSrc; + + // Use DFS numbers to compare destination blocks, to guarantee a + // deterministic order. + DomTreeNode *DomADest = DT.getNode(ADest); + DomTreeNode *DomBDest = DT.getNode(BDest); + unsigned AIn = DomADest->getDFSNumIn(); + unsigned BIn = DomBDest->getDFSNumIn(); + bool isADef = A.Def; + bool isBDef = B.Def; + assert((!A.Def || !A.U) && (!B.Def || !B.U) && + "Def and U cannot be set at the same time"); + // Now sort by edge destination and then defs before uses. + return std::tie(AIn, isADef) < std::tie(BIn, isBDef); } // Get the definition of an instruction that occurs in the middle of a block. @@ -306,10 +338,11 @@ void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) { } // Add Op, PB to the list of value infos for Op, and mark Op to be renamed. -void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op, +void PredicateInfo::addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op, PredicateBase *PB) { - OpsToRename.insert(Op); auto &OperandInfo = getOrCreateValueInfo(Op); + if (OperandInfo.Infos.empty()) + OpsToRename.push_back(Op); AllInfos.push_back(PB); OperandInfo.Infos.push_back(PB); } @@ -317,7 +350,7 @@ void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op, // Process an assume instruction and place relevant operations we want to rename // into OpsToRename. void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { // See if we have a comparison we support SmallVector<Value *, 8> CmpOperands; SmallVector<Value *, 2> ConditionsToProcess; @@ -357,7 +390,7 @@ void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB, // Process a block terminating branch, and place relevant operations to be // renamed into OpsToRename. void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { BasicBlock *FirstBB = BI->getSuccessor(0); BasicBlock *SecondBB = BI->getSuccessor(1); SmallVector<BasicBlock *, 2> SuccsToProcess; @@ -427,7 +460,7 @@ void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB, // Process a block terminating switch, and place relevant operations to be // renamed into OpsToRename. 
void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB, - SmallPtrSetImpl<Value *> &OpsToRename) { + SmallVectorImpl<Value *> &OpsToRename) { Value *Op = SI->getCondition(); if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse()) return; @@ -457,7 +490,7 @@ void PredicateInfo::buildPredicateInfo() { DT.updateDFSNumbers(); // Collect operands to rename from all conditional branch terminators, as well // as assume statements. - SmallPtrSet<Value *, 8> OpsToRename; + SmallVector<Value *, 8> OpsToRename; for (auto DTN : depth_first(DT.getRootNode())) { BasicBlock *BranchBB = DTN->getBlock(); if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) { @@ -524,7 +557,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, if (isa<PredicateWithEdge>(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++)); @@ -536,7 +569,7 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, "Should not have gotten here without it being an assume"); IRBuilder<> B(PAssume->AssumeInst); Function *IF = getCopyDeclaration(F.getParent(), Op->getType()); - if (empty(IF->users())) + if (IF->users().empty()) CreatedDeclarations.insert(IF); CallInst *PIC = B.CreateCall(IF, Op); PredicateMap.insert({PIC, ValInfo}); @@ -565,14 +598,8 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter, // // TODO: Use this algorithm to perform fast single-variable renaming in // promotememtoreg and memoryssa. -void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpSet) { - // Sort OpsToRename since we are going to iterate it. - SmallVector<Value *, 8> OpsToRename(OpSet.begin(), OpSet.end()); - auto Comparator = [&](const Value *A, const Value *B) { - return valueComesBefore(OI, A, B); - }; - llvm::sort(OpsToRename, Comparator); - ValueDFS_Compare Compare(OI); +void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) { + ValueDFS_Compare Compare(DT, OI); // Compute liveness, and rename in O(uses) per Op. 
for (auto *Op : OpsToRename) { LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n"); @@ -772,7 +799,7 @@ static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) { bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) { auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); - auto PredInfo = make_unique<PredicateInfo>(F, DT, AC); + auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC); PredInfo->print(dbgs()); if (VerifyPredicateInfo) PredInfo->verifyPredicateInfo(); @@ -786,7 +813,7 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); OS << "PredicateInfo for function: " << F.getName() << "\n"; - auto PredInfo = make_unique<PredicateInfo>(F, DT, AC); + auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC); PredInfo->print(OS); replaceCreatedSSACopys(*PredInfo, F); @@ -845,7 +872,7 @@ PreservedAnalyses PredicateInfoVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo(); + std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo(); return PreservedAnalyses::all(); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 6e2ef67408d9..0ea6e99e6f19 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/GuardUtils.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/MemorySSAUpdater.h" @@ -94,6 +95,12 @@ static cl::opt<unsigned> PHINodeFoldingThreshold( cl::desc( "Control the amount of phi node folding to perform (default = 2)")); +static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold( + "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4), + cl::desc("Control the maximal total instruction cost that we are willing " + "to speculatively execute to fold a 2-entry PHI node into a " + "select (default = 4)")); + static cl::opt<bool> DupRet( "simplifycfg-dup-ret", cl::Hidden, cl::init(false), cl::desc("Duplicate return instructions into unconditional branches")); @@ -332,7 +339,7 @@ static unsigned ComputeSpeculationCost(const User *I, /// CostRemaining, false is returned and CostRemaining is undefined. static bool DominatesMergePoint(Value *V, BasicBlock *BB, SmallPtrSetImpl<Instruction *> &AggressiveInsts, - unsigned &CostRemaining, + int &BudgetRemaining, const TargetTransformInfo &TTI, unsigned Depth = 0) { // It is possible to hit a zero-cost cycle (phi/gep instructions for example), @@ -375,7 +382,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, if (!isSafeToSpeculativelyExecute(I)) return false; - unsigned Cost = ComputeSpeculationCost(I, TTI); + BudgetRemaining -= ComputeSpeculationCost(I, TTI); // Allow exactly one instruction to be speculated regardless of its cost // (as long as it is safe to do so). @@ -383,17 +390,14 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB, // or other expensive operation. 
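The switch from an unsigned CostRemaining to a signed BudgetRemaining above removes the need for wrap-around clamping; a small standalone model of the new accounting, using the default threshold of 4 basic costs from the option added in this hunk and ignoring the SpeculateOneExpensiveInst exception, is:

#include <cstdio>

int main() {
  const int TCC_Basic = 1;              // stand-in for TargetTransformInfo::TCC_Basic
  int BudgetRemaining = 4 * TCC_Basic;  // TwoEntryPHINodeFoldingThreshold default
  const int InstCosts[] = {1, 1, 4};    // speculation candidates in dominance order

  for (int Cost : InstCosts) {
    BudgetRemaining -= Cost;            // may legitimately go negative now
    if (BudgetRemaining < 0) {
      std::puts("budget exhausted, refuse to speculate");
      return 0;
    }
    std::printf("speculated one instruction, budget left: %d\n", BudgetRemaining);
  }
  return 0;
}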
The speculation of an expensive instruction // is expected to be undone in CodeGenPrepare if the speculation has not // enabled further IR optimizations. - if (Cost > CostRemaining && + if (BudgetRemaining < 0 && (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0)) return false; - // Avoid unsigned wrap. - CostRemaining = (Cost > CostRemaining) ? 0 : CostRemaining - Cost; - // Okay, we can only really hoist these out if their operands do // not take us over the cost threshold. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) - if (!DominatesMergePoint(*i, BB, AggressiveInsts, CostRemaining, TTI, + if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI, Depth + 1)) return false; // Okay, it's safe to do this! Remember this instruction. @@ -629,8 +633,7 @@ private: /// vector. /// One "Extra" case is allowed to differ from the other. void gather(Value *V) { - Instruction *I = dyn_cast<Instruction>(V); - bool isEQ = (I->getOpcode() == Instruction::Or); + bool isEQ = (cast<Instruction>(V)->getOpcode() == Instruction::Or); // Keep a stack (SmallVector for efficiency) for depth-first traversal SmallVector<Value *, 8> DFT; @@ -1313,7 +1316,8 @@ static bool HoistThenElseCodeToIf(BranchInst *BI, LLVMContext::MD_dereferenceable, LLVMContext::MD_dereferenceable_or_null, LLVMContext::MD_mem_parallel_loop_access, - LLVMContext::MD_access_group}; + LLVMContext::MD_access_group, + LLVMContext::MD_preserve_access_index}; combineMetadata(I1, I2, KnownIDs, true); // I1 and I2 are being combined into a single instruction. Its debug @@ -1400,10 +1404,16 @@ HoistTerminator: // These values do not agree. Insert a select instruction before NT // that determines the right value. SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)]; - if (!SI) + if (!SI) { + // Propagate fast-math-flags from phi node to its replacement select. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN.getFastMathFlags()); + SI = cast<SelectInst>( Builder.CreateSelect(BI->getCondition(), BB1V, BB2V, BB1V->getName() + "." + BB2V->getName(), BI)); + } // Make the PHI node use the select for all incoming values for BB1/BB2 for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) @@ -1420,6 +1430,20 @@ HoistTerminator: return true; } +// Check lifetime markers. +static bool isLifeTimeMarker(const Instruction *I) { + if (auto II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + return false; +} + // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -1472,20 +1496,25 @@ static bool canSinkInstructions( return false; } - // Because SROA can't handle speculating stores of selects, try not - // to sink loads or stores of allocas when we'd have to create a PHI for - // the address operand. Also, because it is likely that loads or stores - // of allocas will disappear when Mem2Reg/SROA is run, don't sink them. + // Because SROA can't handle speculating stores of selects, try not to sink + // loads, stores or lifetime markers of allocas when we'd have to create a + // PHI for the address operand. 
Also, because it is likely that loads or + // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink + // them. // This can cause code churn which can have unintended consequences down // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244. // FIXME: This is a workaround for a deficiency in SROA - see // https://llvm.org/bugs/show_bug.cgi?id=30188 if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) { - return isa<AllocaInst>(I->getOperand(1)); + return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts()); })) return false; if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) { - return isa<AllocaInst>(I->getOperand(0)); + return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts()); + })) + return false; + if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) { + return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts()); })) return false; @@ -1958,7 +1987,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics; - unsigned SpeculationCost = 0; + unsigned SpeculatedInstructions = 0; Value *SpeculatedStoreValue = nullptr; StoreInst *SpeculatedStore = nullptr; for (BasicBlock::iterator BBI = ThenBB->begin(), @@ -1973,8 +2002,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // Only speculatively execute a single instruction (not counting the // terminator) for now. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; // Don't hoist the instruction if it's unsafe or expensive. @@ -2011,8 +2040,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, E = SinkCandidateUseCounts.end(); I != E; ++I) if (I->first->hasNUses(I->second)) { - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2052,8 +2081,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB, // getting expanded into Instructions. // FIXME: This doesn't account for how many operations are combined in the // constant expression. - ++SpeculationCost; - if (SpeculationCost > 1) + ++SpeculatedInstructions; + if (SpeculatedInstructions > 1) return false; } @@ -2239,14 +2268,14 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL, if (!BBI->use_empty()) TranslateMap[&*BBI] = N; } - // Insert the new instruction into its new home. - if (N) + if (N) { + // Insert the new instruction into its new home. EdgeBB->getInstList().insert(InsertPt, N); - // Register the new instruction with the assumption cache if necessary. - if (auto *II = dyn_cast_or_null<IntrinsicInst>(N)) - if (II->getIntrinsicID() == Intrinsic::assume) - AC->registerAssumption(II); + // Register the new instruction with the assumption cache if necessary. + if (AC && match(N, m_Intrinsic<Intrinsic::assume>())) + AC->registerAssumption(cast<IntrinsicInst>(N)); + } } // Loop over all of the edges from PredBB to BB, changing them to branch @@ -2301,10 +2330,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, // instructions. While we are at it, keep track of the instructions // that need to be moved to the dominating block. 
SmallPtrSet<Instruction *, 4> AggressiveInsts; - unsigned MaxCostVal0 = PHINodeFoldingThreshold, - MaxCostVal1 = PHINodeFoldingThreshold; - MaxCostVal0 *= TargetTransformInfo::TCC_Basic; - MaxCostVal1 *= TargetTransformInfo::TCC_Basic; + int BudgetRemaining = + TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) { PHINode *PN = cast<PHINode>(II++); @@ -2315,9 +2342,9 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, } if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts, - MaxCostVal0, TTI) || + BudgetRemaining, TTI) || !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts, - MaxCostVal1, TTI)) + BudgetRemaining, TTI)) return false; } @@ -2327,12 +2354,24 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (!PN) return true; - // Don't fold i1 branches on PHIs which contain binary operators. These can - // often be turned into switches and other things. + // Return true if at least one of these is a 'not', and another is either + // a 'not' too, or a constant. + auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) { + if (!match(V0, m_Not(m_Value()))) + std::swap(V0, V1); + auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant()); + return match(V0, m_Not(m_Value())) && match(V1, Invertible); + }; + + // Don't fold i1 branches on PHIs which contain binary operators, unless one + // of the incoming values is an 'not' and another one is freely invertible. + // These can often be turned into switches and other things. if (PN->getType()->isIntegerTy(1) && (isa<BinaryOperator>(PN->getIncomingValue(0)) || isa<BinaryOperator>(PN->getIncomingValue(1)) || - isa<BinaryOperator>(IfCond))) + isa<BinaryOperator>(IfCond)) && + !CanHoistNotFromBothValues(PN->getIncomingValue(0), + PN->getIncomingValue(1))) return false; // If all PHI nodes are promotable, check to make sure that all instructions @@ -2367,6 +2406,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, return false; } } + assert(DomBlock && "Failed to find root DomBlock"); LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: " << IfTrue->getName() @@ -2384,7 +2424,12 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI, if (IfBlock2) hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2); + // Propagate fast-math-flags from phi nodes to replacement selects. + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) { + if (isa<FPMathOperator>(PN)) + Builder.setFastMathFlags(PN->getFastMathFlags()); + // Change the PHI node into a select instruction. Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse); Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue); @@ -2912,42 +2957,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, BasicBlock *QTB, BasicBlock *QFB, BasicBlock *PostBB, Value *Address, bool InvertPCond, bool InvertQCond, - const DataLayout &DL) { - auto IsaBitcastOfPointerType = [](const Instruction &I) { - return Operator::getOpcode(&I) == Instruction::BitCast && - I.getType()->isPointerTy(); - }; - - // If we're not in aggressive mode, we only optimize if we have some - // confidence that by optimizing we'll allow P and/or Q to be if-converted. 
- auto IsWorthwhile = [&](BasicBlock *BB) { - if (!BB) - return true; - // Heuristic: if the block can be if-converted/phi-folded and the - // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to - // thread this store. - unsigned N = 0; - for (auto &I : BB->instructionsWithoutDebug()) { - // Cheap instructions viable for folding. - if (isa<BinaryOperator>(I) || isa<GetElementPtrInst>(I) || - isa<StoreInst>(I)) - ++N; - // Free instructions. - else if (I.isTerminator() || IsaBitcastOfPointerType(I)) - continue; - else - return false; - } - // The store we want to merge is counted in N, so add 1 to make sure - // we're counting the instructions that would be left. - return N <= (PHINodeFoldingThreshold + 1); - }; - - if (!MergeCondStoresAggressively && - (!IsWorthwhile(PTB) || !IsWorthwhile(PFB) || !IsWorthwhile(QTB) || - !IsWorthwhile(QFB))) - return false; - + const DataLayout &DL, + const TargetTransformInfo &TTI) { // For every pointer, there must be exactly two stores, one coming from // PTB or PFB, and the other from QTB or QFB. We don't support more than one // store (to any address) in PTB,PFB or QTB,QFB. @@ -2988,6 +2999,46 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, if (&*I != PStore && I->mayReadOrWriteMemory()) return false; + // If we're not in aggressive mode, we only optimize if we have some + // confidence that by optimizing we'll allow P and/or Q to be if-converted. + auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) { + if (!BB) + return true; + // Heuristic: if the block can be if-converted/phi-folded and the + // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to + // thread this store. + int BudgetRemaining = + PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic; + for (auto &I : BB->instructionsWithoutDebug()) { + // Consider terminator instruction to be free. + if (I.isTerminator()) + continue; + // If this is one the stores that we want to speculate out of this BB, + // then don't count it's cost, consider it to be free. + if (auto *S = dyn_cast<StoreInst>(&I)) + if (llvm::find(FreeStores, S)) + continue; + // Else, we have a white-list of instructions that we are ak speculating. + if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I)) + return false; // Not in white-list - not worthwhile folding. + // And finally, if this is a non-free instruction that we are okay + // speculating, ensure that we consider the speculation budget. + BudgetRemaining -= TTI.getUserCost(&I); + if (BudgetRemaining < 0) + return false; // Eagerly refuse to fold as soon as we're out of budget. + } + assert(BudgetRemaining >= 0 && + "When we run out of budget we will eagerly return from within the " + "per-instruction loop."); + return true; + }; + + const SmallVector<StoreInst *, 2> FreeStores = {PStore, QStore}; + if (!MergeCondStoresAggressively && + (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) || + !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores))) + return false; + // If PostBB has more than two predecessors, we need to split it so we can // sink the store. if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) { @@ -3047,15 +3098,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. 
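The rewritten IsWorthwhile heuristic further down in this hunk charges every non-terminator instruction against a budget of PHINodeFoldingThreshold * TCC_Basic, except for the two stores that are about to be merged; this standalone sketch models that accounting with a plain cost field instead of TTI::getUserCost and the binary-operator/GEP whitelist.

#include <cstdio>
#include <vector>

struct Inst {
  int Cost;
  bool IsTerminator;
  bool IsFreeStore; // one of the stores that will be sunk and merged
};

static bool isWorthwhile(const std::vector<Inst> &BB, int Threshold) {
  int BudgetRemaining = Threshold; // PHINodeFoldingThreshold * TCC_Basic
  for (const Inst &I : BB) {
    if (I.IsTerminator || I.IsFreeStore)
      continue; // considered free, not charged against the budget
    BudgetRemaining -= I.Cost;
    if (BudgetRemaining < 0)
      return false; // eagerly refuse once over budget
  }
  return true;
}

int main() {
  // A GEP feeding the store, the store itself, and the block terminator.
  std::vector<Inst> Block = {{1, false, false}, {0, false, true}, {0, true, false}};
  std::printf("worthwhile to merge: %s\n",
              isWorthwhile(Block, 2 /* default threshold */) ? "yes" : "no");
  return 0;
}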
- SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. - SI->setAlignment(TypeAlignment); + SI->setAlignment(Align(TypeAlignment)); } QStore->eraseFromParent(); @@ -3065,7 +3116,8 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, } static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { // The intention here is to find diamonds or triangles (see below) where each // conditional block contains a store to the same address. Both of these // stores are conditional, so they can't be unconditionally sunk. But it may @@ -3167,16 +3219,58 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI, bool Changed = false; for (auto *Address : CommonAddresses) Changed |= mergeConditionalStoreToAddress( - PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL); + PTB, PFB, QTB, QFB, PostBB, Address, InvertPCond, InvertQCond, DL, TTI); return Changed; } + +/// If the previous block ended with a widenable branch, determine if reusing +/// the target block is profitable and legal. This will have the effect of +/// "widening" PBI, but doesn't require us to reason about hosting safety. +static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) { + // TODO: This can be generalized in two important ways: + // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input + // values from the PBI edge. + // 2) We can sink side effecting instructions into BI's fallthrough + // successor provided they doesn't contribute to computation of + // BI's condition. + Value *CondWB, *WC; + BasicBlock *IfTrueBB, *IfFalseBB; + if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) || + IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor()) + return false; + if (!IfFalseBB->phis().empty()) + return false; // TODO + // Use lambda to lazily compute expensive condition after cheap ones. + auto NoSideEffects = [](BasicBlock &BB) { + return !llvm::any_of(BB, [](const Instruction &I) { + return I.mayWriteToMemory() || I.mayHaveSideEffects(); + }); + }; + if (BI->getSuccessor(1) != IfFalseBB && // no inf looping + BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(1)->removePredecessor(BI->getParent()); + BI->setSuccessor(1, IfFalseBB); + return true; + } + if (BI->getSuccessor(0) != IfFalseBB && // no inf looping + BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability + NoSideEffects(*BI->getParent())) { + BI->getSuccessor(0)->removePredecessor(BI->getParent()); + BI->setSuccessor(0, IfFalseBB); + return true; + } + return false; +} + /// If we have a conditional branch as a predecessor of another block, /// this function tries to simplify it. We know /// that PBI and BI are both conditional branches, and BI is in one of the /// successor blocks of PBI - PBI branches to BI. 
static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, - const DataLayout &DL) { + const DataLayout &DL, + const TargetTransformInfo &TTI) { assert(PBI->isConditional() && BI->isConditional()); BasicBlock *BB = BI->getParent(); @@ -3225,6 +3319,12 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, } } + // If the previous block ended with a widenable branch, determine if reusing + // the target block is profitable and legal. This will have the effect of + // "widening" PBI, but doesn't require us to reason about hosting safety. + if (tryWidenCondBranchToCondBranch(PBI, BI)) + return true; + if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition())) if (CE->canTrap()) return false; @@ -3232,7 +3332,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI, // If both branches are conditional and both contain stores to the same // address, remove the stores from the conditionals and create a conditional // merged store at the end. - if (MergeCondStores && mergeConditionalStores(PBI, BI, DL)) + if (MergeCondStores && mergeConditionalStores(PBI, BI, DL, TTI)) return true; // If this is a conditional branch in an empty block, and if any @@ -3696,12 +3796,17 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder, BasicBlock *BB = BI->getParent(); + // MSAN does not like undefs as branch condition which can be introduced + // with "explicit branch". + if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory)) + return false; + LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size() << " cases into SWITCH. BB is:\n" << *BB); // If there are any extra values that couldn't be folded into the switch - // then we evaluate them with an explicit branch first. Split the block + // then we evaluate them with an explicit branch first. Split the block // right before the condbr to handle it. if (ExtraCase) { BasicBlock *NewBB = @@ -3850,7 +3955,7 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) { // Simplify resume that is only used by a single (non-phi) landing pad. 
bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) { BasicBlock *BB = RI->getParent(); - LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()); + auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI()); assert(RI->getValue() == LPInst && "Resume must unwind the exception that caused control to here"); @@ -4177,23 +4282,22 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) { IRBuilder<> Builder(TI); if (auto *BI = dyn_cast<BranchInst>(TI)) { if (BI->isUnconditional()) { - if (BI->getSuccessor(0) == BB) { - new UnreachableInst(TI->getContext(), TI); - TI->eraseFromParent(); - Changed = true; - } + assert(BI->getSuccessor(0) == BB && "Incorrect CFG"); + new UnreachableInst(TI->getContext(), TI); + TI->eraseFromParent(); + Changed = true; } else { Value* Cond = BI->getCondition(); if (BI->getSuccessor(0) == BB) { Builder.CreateAssumption(Builder.CreateNot(Cond)); Builder.CreateBr(BI->getSuccessor(1)); - EraseTerminatorAndDCECond(BI); - } else if (BI->getSuccessor(1) == BB) { + } else { + assert(BI->getSuccessor(1) == BB && "Incorrect CFG"); Builder.CreateAssumption(Cond); Builder.CreateBr(BI->getSuccessor(0)); - EraseTerminatorAndDCECond(BI); - Changed = true; } + EraseTerminatorAndDCECond(BI); + Changed = true; } } else if (auto *SI = dyn_cast<SwitchInst>(TI)) { SwitchInstProfUpdateWrapper SU(*SI); @@ -4275,6 +4379,17 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) { return true; } +static void createUnreachableSwitchDefault(SwitchInst *Switch) { + LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); + BasicBlock *NewDefaultBlock = + SplitBlockPredecessors(Switch->getDefaultDest(), Switch->getParent(), ""); + Switch->setDefaultDest(&*NewDefaultBlock); + SplitBlock(&*NewDefaultBlock, &NewDefaultBlock->front()); + auto *NewTerminator = NewDefaultBlock->getTerminator(); + new UnreachableInst(Switch->getContext(), NewTerminator); + EraseTerminatorAndDCECond(NewTerminator); +} + /// Turn a switch with two reachable destinations into an integer range /// comparison and branch. static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { @@ -4383,6 +4498,11 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { cast<PHINode>(BBI)->removeIncomingValue(SI->getParent()); } + // Clean up the default block - it may have phis or other instructions before + // the unreachable terminator. + if (!HasDefault) + createUnreachableSwitchDefault(SI); + // Drop the switch. SI->eraseFromParent(); @@ -4427,14 +4547,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, if (HasDefault && DeadCases.empty() && NumUnknownBits < 64 /* avoid overflow */ && SI->getNumCases() == (1ULL << NumUnknownBits)) { - LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n"); - BasicBlock *NewDefault = - SplitBlockPredecessors(SI->getDefaultDest(), SI->getParent(), ""); - SI->setDefaultDest(&*NewDefault); - SplitBlock(&*NewDefault, &NewDefault->front()); - auto *OldTI = NewDefault->getTerminator(); - new UnreachableInst(SI->getContext(), OldTI); - EraseTerminatorAndDCECond(OldTI); + createUnreachableSwitchDefault(SI); return true; } @@ -5030,7 +5143,7 @@ SwitchLookupTable::SwitchLookupTable( Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Set the alignment to that of an array items. We will be only loading one // value out of it. 
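TurnSwitchRangeIntoICmp applies when the reachable cases of a switch form a contiguous range leading to one destination; the whole switch then reduces to a single unsigned subtract-and-compare. A hedged C++ sketch of the before/after shape (names are illustrative):

// Sketch: cases 10..13 all take the same path, so the switch is equivalent
// to the range test (x - 10) <= 3 in unsigned arithmetic.
bool handled(unsigned x) {
  switch (x) {
  case 10: case 11: case 12: case 13:
    return true;
  default:
    return false;
  }
  // Conceptually, after the transform: return x - 10u <= 3u;
}

int main() { return handled(12) ? 0 : 1; }
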
- Array->setAlignment(DL.getPrefTypeAlignment(ValueType)); + Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType))); Kind = ArrayKind; } @@ -5259,7 +5372,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // Figure out the corresponding result for each case value and phi node in the // common destination, as well as the min and max case values. - assert(!empty(SI->cases())); + assert(!SI->cases().empty()); SwitchInst::CaseIt CI = SI->case_begin(); ConstantInt *MinCaseVal = CI->getCaseValue(); ConstantInt *MaxCaseVal = CI->getCaseValue(); @@ -5891,7 +6004,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (SimplifyCondBranchToCondBranch(PBI, BI, DL)) + if (SimplifyCondBranchToCondBranch(PBI, BI, DL, TTI)) return requestResimplify(); // Look for diamond patterns. @@ -5899,7 +6012,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB)) if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator())) if (PBI != BI && PBI->isConditional()) - if (mergeConditionalStores(PBI, BI, DL)) + if (mergeConditionalStores(PBI, BI, DL, TTI)) return requestResimplify(); return false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e938ae6cb42f..fa3a9d21f3df 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/KnownBits.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/SizeOpts.h" @@ -47,7 +48,6 @@ static cl::opt<bool> cl::desc("Enable unsafe double to float " "shrinking for math lib calls")); - //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// @@ -177,7 +177,8 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, if (!isOnlyUsedInComparisonWithZero(CI)) return false; - if (!isDereferenceableAndAlignedPointer(Str, 1, APInt(64, Len), DL)) + if (!isDereferenceableAndAlignedPointer(Str, Align::None(), APInt(64, Len), + DL)) return false; if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory)) @@ -186,6 +187,67 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len, return true; } +static void annotateDereferenceableBytes(CallInst *CI, + ArrayRef<unsigned> ArgNos, + uint64_t DereferenceableBytes) { + const Function *F = CI->getCaller(); + if (!F) + return; + for (unsigned ArgNo : ArgNos) { + uint64_t DerefBytes = DereferenceableBytes; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + DerefBytes = std::max(CI->getDereferenceableOrNullBytes( + ArgNo + AttributeList::FirstArgIndex), + DereferenceableBytes); + + if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) < + DerefBytes) { + CI->removeParamAttr(ArgNo, Attribute::Dereferenceable); + if 
(!llvm::NullPointerIsDefined(F, AS) || + CI->paramHasAttr(ArgNo, Attribute::NonNull)) + CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull); + CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes( + CI->getContext(), DerefBytes)); + } + } +} + +static void annotateNonNullBasedOnAccess(CallInst *CI, + ArrayRef<unsigned> ArgNos) { + Function *F = CI->getCaller(); + if (!F) + return; + + for (unsigned ArgNo : ArgNos) { + if (CI->paramHasAttr(ArgNo, Attribute::NonNull)) + continue; + unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace(); + if (llvm::NullPointerIsDefined(F, AS)) + continue; + + CI->addParamAttr(ArgNo, Attribute::NonNull); + annotateDereferenceableBytes(CI, ArgNo, 1); + } +} + +static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos, + Value *Size, const DataLayout &DL) { + if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); + } else if (isKnownNonZero(Size, DL)) { + annotateNonNullBasedOnAccess(CI, ArgNos); + const APInt *X, *Y; + uint64_t DerefMin = 1; + if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) { + DerefMin = std::min(X->getZExtValue(), Y->getZExtValue()); + annotateDereferenceableBytes(CI, ArgNos, DerefMin); + } + } +} + //===----------------------------------------------------------------------===// // String and Memory Library Call Optimizations //===----------------------------------------------------------------------===// @@ -194,10 +256,13 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; --Len; // Unbias length. @@ -223,8 +288,9 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(CpyDst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); + B.CreateMemCpy( + CpyDst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1)); return Dst; } @@ -232,24 +298,34 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) { // Extract some information from the instruction. Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); uint64_t Len; + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); // We don't do anything if length is not constant. - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size); + if (LengthArg) { Len = LengthArg->getZExtValue(); - else + // strncat(x, c, 0) -> x + if (!Len) + return Dst; + } else { return nullptr; + } // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; // Unbias length. 
+ } - // Handle the simple, do-nothing cases: // strncat(x, "", c) -> x - // strncat(x, c, 0) -> x - if (SrcLen == 0 || Len == 0) + if (SrcLen == 0) return Dst; // We don't optimize this case. @@ -265,13 +341,18 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); FunctionType *FT = Callee->getFunctionType(); Value *SrcStr = CI->getArgOperand(0); + annotateNonNullBasedOnAccess(CI, 0); // If the second operand is non-constant, see if we can compute the length // of the input string and turn this into memchr. ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!CharC) { uint64_t Len = GetStringLength(SrcStr); - if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. + if (Len) + annotateDereferenceableBytes(CI, 0, Len); + else + return nullptr; + if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32. return nullptr; return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul. @@ -284,8 +365,8 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { StringRef Str; if (!getConstantStringInfo(SrcStr, Str)) { if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p) - return B.CreateGEP(B.getInt8Ty(), SrcStr, emitStrLen(SrcStr, B, DL, TLI), - "strchr"); + if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI)) + return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr"); return nullptr; } @@ -304,6 +385,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + annotateNonNullBasedOnAccess(CI, 0); // Cannot fold anything if we're not looking for a constant. if (!CharC) @@ -351,7 +433,12 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { // strcmp(P, "x") -> memcmp(P, "x", 2) uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); + if (Len1 && Len2) { return emitMemCmp(Str1P, Str2P, ConstantInt::get(DL.getIntPtrType(CI->getContext()), @@ -374,17 +461,22 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) { TLI); } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { - Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); + Value *Str1P = CI->getArgOperand(0); + Value *Str2P = CI->getArgOperand(1); + Value *Size = CI->getArgOperand(2); if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, {0, 1}); // Get the length argument if it is constant. 
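The strchr(p, 0) -> p + strlen(p) fold above relies on strchr treating the terminating nul as part of the searched string, so asking for '\0' always yields the end pointer. A minimal self-contained check of that identity:

#include <cassert>
#include <cstring>

int main() {
  const char *p = "simplify";
  // strchr(p, 0) and p + strlen(p) are guaranteed to be the same pointer.
  assert(std::strchr(p, 0) == p + std::strlen(p));
  return 0;
}
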
uint64_t Length; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2))) + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size)) Length = LengthArg->getZExtValue(); else return nullptr; @@ -393,7 +485,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return ConstantInt::get(CI->getType(), 0); if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return emitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, DL, TLI); + return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -415,7 +507,11 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { CI->getType()); uint64_t Len1 = GetStringLength(Str1P); + if (Len1) + annotateDereferenceableBytes(CI, 0, Len1); uint64_t Len2 = GetStringLength(Str2P); + if (Len2) + annotateDereferenceableBytes(CI, 1, Len2); // strncmp to memcmp if (!HasStr1 && HasStr2) { @@ -437,20 +533,38 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) { + Value *Src = CI->getArgOperand(0); + ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + uint64_t SrcLen = GetStringLength(Src); + if (SrcLen && Size) { + annotateDereferenceableBytes(CI, 0, SrcLen); + if (SrcLen <= Size->getZExtValue() + 1) + return emitStrDup(Src, B, TLI); + } + + return nullptr; +} + Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) { Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); if (Dst == Src) // strcpy(x,x) -> x return Src; - + + annotateNonNullBasedOnAccess(CI, {0, 1}); // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. - B.CreateMemCpy(Dst, 1, Src, 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -464,7 +578,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // See if we can get the length of the input string. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); @@ -474,7 +590,9 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. 
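For optimizeStrCpy, once GetStringLength proves the source length (it returns the length including the nul), the whole copy becomes a fixed-size memcpy. A rough C++ picture of the rewrite, assuming a 5-character source:

#include <cstdio>
#include <cstring>

int main() {
  char dst[8];
  const char *src = "hello";
  // strcpy(dst, src);         // original call; source length is known
  std::memcpy(dst, src, 6);    // the rewrite: 5 characters plus the nul byte
  std::puts(dst);
  return 0;
}
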
- B.CreateMemCpy(Dst, 1, Src, 1, LenV); + CallInst *NewCI = + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), LenV); + NewCI->setAttributes(CI->getAttributes()); return DstEnd; } @@ -482,37 +600,48 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); Value *Dst = CI->getArgOperand(0); Value *Src = CI->getArgOperand(1); - Value *LenOp = CI->getArgOperand(2); + Value *Size = CI->getArgOperand(2); + annotateNonNullBasedOnAccess(CI, 0); + if (isKnownNonZero(Size, DL)) + annotateNonNullBasedOnAccess(CI, 1); + + uint64_t Len; + if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size)) + Len = LengthArg->getZExtValue(); + else + return nullptr; + + // strncpy(x, y, 0) -> x + if (Len == 0) + return Dst; // See if we can get the length of the input string. uint64_t SrcLen = GetStringLength(Src); - if (SrcLen == 0) + if (SrcLen) { + annotateDereferenceableBytes(CI, 1, SrcLen); + --SrcLen; // Unbias length. + } else { return nullptr; - --SrcLen; + } if (SrcLen == 0) { // strncpy(x, "", y) -> memset(align 1 x, '\0', y) - B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); + CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align::None()); + AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0)); + NewCI->setAttributes(NewCI->getAttributes().addParamAttributes( + CI->getContext(), 0, ArgAttrs)); return Dst; } - uint64_t Len; - if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp)) - Len = LengthArg->getZExtValue(); - else - return nullptr; - - if (Len == 0) - return Dst; // strncpy(x, y, 0) -> x - // Let strncpy handle the zero padding if (Len > SrcLen + 1) return nullptr; Type *PT = Callee->getFunctionType()->getParamType(0); // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant] - B.CreateMemCpy(Dst, 1, Src, 1, ConstantInt::get(DL.getIntPtrType(PT), Len)); - + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + ConstantInt::get(DL.getIntPtrType(PT), Len)); + NewCI->setAttributes(CI->getAttributes()); return Dst; } @@ -608,7 +737,10 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) { - return optimizeStringLength(CI, B, 8); + if (Value *V = optimizeStringLength(CI, B, 8)) + return V; + annotateNonNullBasedOnAccess(CI, 0); + return nullptr; } Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) { @@ -756,21 +888,35 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) { Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI); return StrChr ? 
B.CreateBitCast(StrChr, CI->getType()) : nullptr; } + + annotateNonNullBasedOnAccess(CI, {0, 1}); + return nullptr; +} + +Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilder<> &B) { + if (isKnownNonZero(CI->getOperand(2), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) { Value *SrcStr = CI->getArgOperand(0); + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); - ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + ConstantInt *LenC = dyn_cast<ConstantInt>(Size); // memchr(x, y, 0) -> null - if (LenC && LenC->isZero()) - return Constant::getNullValue(CI->getType()); + if (LenC) { + if (LenC->isZero()) + return Constant::getNullValue(CI->getType()); + } else { + // From now on we need at least constant length and string. + return nullptr; + } - // From now on we need at least constant length and string. StringRef Str; - if (!LenC || !getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) + if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false)) return nullptr; // Truncate the string to LenC. If Str is smaller than LenC we will still only @@ -913,6 +1059,7 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS, Ret = 1; return ConstantInt::get(CI->getType(), Ret); } + return nullptr; } @@ -925,12 +1072,19 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI, if (LHS == RHS) // memcmp(s,s,x) -> 0 return Constant::getNullValue(CI->getType()); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); // Handle constant lengths. - if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) - if (Value *Res = optimizeMemCmpConstantSize(CI, LHS, RHS, - LenC->getZExtValue(), B, DL)) - return Res; + ConstantInt *LenC = dyn_cast<ConstantInt>(Size); + if (!LenC) + return nullptr; + // memcmp(d,s,0) -> 0 + if (LenC->getZExtValue() == 0) + return Constant::getNullValue(CI->getType()); + + if (Value *Res = + optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL)) + return Res; return nullptr; } @@ -939,9 +1093,9 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) { return V; // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0 - // `bcmp` can be more efficient than memcmp because it only has to know that - // there is a difference, not where it is. - if (isOnlyUsedInZeroEqualityComparison(CI) && TLI->has(LibFunc_bcmp)) { + // bcmp can be more efficient than memcmp because it only has to know that + // there is a difference, not how different one is to the other. 
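The rewrite fires only when the memcmp result is consumed purely as an equality test against zero, as in this sketch; bcmp reports just zero or non-zero, with no ordering, which is exactly enough for such callers (the helper name is illustrative):

#include <cstring>

// Sketch: only the zero/non-zero outcome is used, so this call qualifies for
// the memcmp -> bcmp rewrite on targets where bcmp is available.
static bool blocksEqual(const void *a, const void *b, std::size_t n) {
  return std::memcmp(a, b, n) == 0;
}

int main() {
  int x[2] = {1, 2}, y[2] = {1, 2};
  return blocksEqual(x, y, sizeof(x)) ? 0 : 1;
}
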
+ if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) { Value *LHS = CI->getArgOperand(0); Value *RHS = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); @@ -956,16 +1110,78 @@ Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } +Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2)); + ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3)); + StringRef SrcStr; + if (CI->use_empty() && Dst == Src) + return Dst; + // memccpy(d, s, c, 0) -> nullptr + if (N) { + if (N->isNullValue()) + return Constant::getNullValue(CI->getType()); + if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0, + /*TrimAtNul=*/false) || + !StopChar) + return nullptr; + } else { + return nullptr; + } + + // Wrap arg 'c' of type int to char + size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); + if (Pos == StringRef::npos) { + if (N->getZExtValue() <= SrcStr.size()) { + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), + CI->getArgOperand(3)); + return Constant::getNullValue(CI->getType()); + } + return nullptr; + } + + Value *NewN = + ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); + // memccpy -> llvm.memcpy + B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), NewN); + return Pos + 1 <= N->getZExtValue() + ? 
B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) + : Constant::getNullValue(CI->getType()); +} + +Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *N = CI->getArgOperand(2); + // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n + CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), CI->getArgOperand(1), + Align::None(), N); + NewCI->setAttributes(CI->getAttributes()); + return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N); +} + Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n) - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), Size); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1003,25 +1219,30 @@ Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) { B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator()); const DataLayout &DL = Malloc->getModule()->getDataLayout(); IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext()); - Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), - Malloc->getArgOperand(0), Malloc->getAttributes(), - B, *TLI); - if (!Calloc) - return nullptr; - - Malloc->replaceAllUsesWith(Calloc); - eraseFromParent(Malloc); + if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1), + Malloc->getArgOperand(0), + Malloc->getAttributes(), B, *TLI)) { + substituteInParent(Malloc, Calloc); + return Calloc; + } - return Calloc; + return nullptr; } Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) { + Value *Size = CI->getArgOperand(2); + annotateNonNullAndDereferenceable(CI, 0, Size, DL); + if (isa<IntrinsicInst>(CI)) + return nullptr; + if (auto *Calloc = foldMallocMemset(CI, B)) return Calloc; // memset(p, v, n) -> llvm.memset(align 1 p, v, n) Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = + B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align::None()); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } @@ -1096,21 +1317,18 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, if (!V[0] || (isBinary && !V[1])) return nullptr; - StringRef CalleeNm = CalleeFn->getName(); - AttributeList CalleeAt = CalleeFn->getAttributes(); - bool CalleeIn = CalleeFn->isIntrinsic(); - // If call isn't an intrinsic, check that it isn't within a function with the // same name as the float version of this call, otherwise the result is an // infinite loop. 
For example, from MinGW-w64: // // float expf(float val) { return (float) exp((double) val); } - if (!CalleeIn) { - const Function *Fn = CI->getFunction(); - StringRef FnName = Fn->getName(); - if (FnName.back() == 'f' && - FnName.size() == (CalleeNm.size() + 1) && - FnName.startswith(CalleeNm)) + StringRef CalleeName = CalleeFn->getName(); + bool IsIntrinsic = CalleeFn->isIntrinsic(); + if (!IsIntrinsic) { + StringRef CallerName = CI->getFunction()->getName(); + if (!CallerName.empty() && CallerName.back() == 'f' && + CallerName.size() == (CalleeName.size() + 1) && + CallerName.startswith(CalleeName)) return nullptr; } @@ -1120,16 +1338,16 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B, // g((double) float) -> (double) gf(float) Value *R; - if (CalleeIn) { + if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); + } else { + AttributeList CalleeAttrs = CalleeFn->getAttributes(); + R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs) + : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs); } - else - R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeNm, B, CalleeAt) - : emitUnaryFloatFnCall(V[0], CalleeNm, B, CalleeAt); - return B.CreateFPExt(R, B.getDoubleTy()); } @@ -1234,9 +1452,25 @@ static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) { return InnerChain[Exp]; } +// Return a properly extended 32-bit integer if the operation is an itofp. +static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) { + if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) { + Value *Op = cast<Instruction>(I2F)->getOperand(0); + // Make sure that the exponent fits inside an int32_t, + // thus avoiding any range issues that FP has not. + unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits(); + if (BitWidth < 32 || + (BitWidth == 32 && isa<SIToFPInst>(I2F))) + return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty()) + : B.CreateZExt(Op, B.getInt32Ty()); + } + + return nullptr; +} + /// Use exp{,2}(x * y) for pow(exp{,2}(x), y); -/// exp2(n * x) for pow(2.0 ** n, x); exp10(x) for pow(10.0, x); -/// exp2(log2(n) * x) for pow(n, x). +/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x); +/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x). Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1); AttributeList Attrs = Pow->getCalledFunction()->getAttributes(); @@ -1269,9 +1503,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { StringRef ExpName; Intrinsic::ID ID; Value *ExpFn; - LibFunc LibFnFloat; - LibFunc LibFnDouble; - LibFunc LibFnLongDouble; + LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble; switch (LibFn) { default: @@ -1305,9 +1537,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // elimination cannot be trusted to remove it, since it may have side // effects (e.g., errno). When the only consumer for the original // exp{,2}() is pow(), then it has to be explicitly erased. 
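The replacePowWithExp fold rests on the identity pow(exp(x), y) == exp(x * y); it is guarded by fast-math because the two evaluations can round differently, and the original exp call must be erased explicitly since errno makes it look side-effecting. A purely illustrative numerical check:

#include <cmath>
#include <cstdio>

int main() {
  double x = 0.75, y = 3.5;
  // The two results agree mathematically but may differ in the last bits.
  std::printf("%.17g\n%.17g\n", std::pow(std::exp(x), y), std::exp(x * y));
  return 0;
}
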
- BaseFn->replaceAllUsesWith(ExpFn); - eraseFromParent(BaseFn); - + substituteInParent(BaseFn, ExpFn); return ExpFn; } } @@ -1318,8 +1548,18 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (!match(Pow->getArgOperand(0), m_APFloat(BaseF))) return nullptr; + // pow(2.0, itofp(x)) -> ldexp(1.0, x) + if (match(Base, m_SpecificFP(2.0)) && + (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, Attrs); + } + // pow(2.0 ** n, x) -> exp2(n * x) - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { + if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) { APFloat BaseR = APFloat(1.0); BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored); BaseR = BaseR / *BaseF; @@ -1344,7 +1584,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { // pow(10.0, x) -> exp10(x) // TODO: There is no exp10() intrinsic yet, but some day there shall be one. if (match(Base, m_SpecificFP(10.0)) && - hasUnaryFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) + hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l)) return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l, B, Attrs); @@ -1359,17 +1599,15 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) { if (Log) { Value *FMul = B.CreateFMul(Log, Expo, "mul"); - if (Pow->doesNotAccessMemory()) { + if (Pow->doesNotAccessMemory()) return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty), FMul, "exp2"); - } else { - if (hasUnaryFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l)) - return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, - LibFunc_exp2l, B, Attrs); - } + else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) + return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f, + LibFunc_exp2l, B, Attrs); } } + return nullptr; } @@ -1384,8 +1622,7 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno, } // Otherwise, use the libcall for sqrt(). - if (hasUnaryFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, - LibFunc_sqrtl)) + if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl)) // TODO: We also should check that the target can in fact lower the sqrt() // libcall. We currently have no way to ask this question, so we ask if // the target has a sqrt() libcall, which is not exactly the same. @@ -1407,6 +1644,11 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) { (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5))) return nullptr; + // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step, + // so that requires fast-math-flags (afn or reassoc). + if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc())) + return nullptr; + Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI); if (!Sqrt) return nullptr; @@ -1452,7 +1694,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { bool Ignored; // Bail out if simplifying libcalls to pow() is disabled. 
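The new pow(2.0, itofp(x)) -> ldexp(1.0, x) case uses ldexp to build an exact power of two from an integer exponent. The sketch below simply prints both sides for a range of exponents so the intended equivalence can be eyeballed (how precisely the pow side is computed depends on the libm):

#include <cmath>
#include <cstdio>

int main() {
  for (int n = -8; n <= 8; ++n)
    std::printf("n=%3d  pow=%g  ldexp=%g\n", n, std::pow(2.0, (double)n),
                std::ldexp(1.0, n));
  return 0;
}
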
- if (!hasUnaryFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) + if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl)) return nullptr; // Propagate the math semantics from the call to any created instructions. @@ -1504,7 +1746,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // TODO: This whole transformation should be backend specific (e.g. some // backends might prefer libcalls or the limit for the exponent might // be different) and it should also consider optimizing for size. - APFloat LimF(ExpoF->getSemantics(), 33.0), + APFloat LimF(ExpoF->getSemantics(), 33), ExpoA(abs(*ExpoF)); if (ExpoA.compare(LimF) == APFloat::cmpLessThan) { // This transformation applies to integer or integer+0.5 exponents only. @@ -1558,16 +1800,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // powf(x, itofp(y)) -> powi(x, y) if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) { - Value *IntExpo = cast<Instruction>(Expo)->getOperand(0); - Value *NewExpo = nullptr; - unsigned BitWidth = IntExpo->getType()->getPrimitiveSizeInBits(); - if (isa<SIToFPInst>(Expo) && BitWidth == 32) - NewExpo = IntExpo; - else if (BitWidth < 32) - NewExpo = isa<SIToFPInst>(Expo) ? B.CreateSExt(IntExpo, B.getInt32Ty()) - : B.CreateZExt(IntExpo, B.getInt32Ty()); - if (NewExpo) - return createPowWithIntegerExponent(Base, NewExpo, M, B); + if (Value *ExpoI = getIntToFPVal(Expo, B)) + return createPowWithIntegerExponent(Base, ExpoI, M, B); } return Shrunk; @@ -1575,45 +1809,25 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) { Function *Callee = CI->getCalledFunction(); - Value *Ret = nullptr; StringRef Name = Callee->getName(); - if (UnsafeFPShrink && Name == "exp2" && hasFloatVersion(Name)) + Value *Ret = nullptr; + if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) && + hasFloatVersion(Name)) Ret = optimizeUnaryDoubleFP(CI, B, true); + Type *Ty = CI->getType(); Value *Op = CI->getArgOperand(0); + // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32 // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32 - LibFunc LdExp = LibFunc_ldexpl; - if (Op->getType()->isFloatTy()) - LdExp = LibFunc_ldexpf; - else if (Op->getType()->isDoubleTy()) - LdExp = LibFunc_ldexp; - - if (TLI->has(LdExp)) { - Value *LdExpArg = nullptr; - if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty()); - } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { - if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty()); - } - - if (LdExpArg) { - Constant *One = ConstantFP::get(CI->getContext(), APFloat(1.0f)); - if (!Op->getType()->isFloatTy()) - One = ConstantExpr::getFPExtend(One, Op->getType()); - - Module *M = CI->getModule(); - FunctionCallee NewCallee = M->getOrInsertFunction( - TLI->getName(LdExp), Op->getType(), Op->getType(), B.getInt32Ty()); - CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg}); - if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) - CI->setCallingConv(F->getCallingConv()); - - return CI; - } + if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) && + hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) { + if (Value *Exp = getIntToFPVal(Op, B)) + return emitBinaryFloatFnCall(ConstantFP::get(Ty, 
1.0), Exp, TLI, + LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl, + B, CI->getCalledFunction()->getAttributes()); } + return Ret; } @@ -1644,48 +1858,155 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) { return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) }); } -Value *LibCallSimplifier::optimizeLog(CallInst *CI, IRBuilder<> &B) { - Function *Callee = CI->getCalledFunction(); +Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) { + Function *LogFn = Log->getCalledFunction(); + AttributeList Attrs = LogFn->getAttributes(); + StringRef LogNm = LogFn->getName(); + Intrinsic::ID LogID = LogFn->getIntrinsicID(); + Module *Mod = Log->getModule(); + Type *Ty = Log->getType(); Value *Ret = nullptr; - StringRef Name = Callee->getName(); - if (UnsafeFPShrink && hasFloatVersion(Name)) - Ret = optimizeUnaryDoubleFP(CI, B, true); - if (!CI->isFast()) - return Ret; - Value *Op1 = CI->getArgOperand(0); - auto *OpC = dyn_cast<CallInst>(Op1); + if (UnsafeFPShrink && hasFloatVersion(LogNm)) + Ret = optimizeUnaryDoubleFP(Log, B, true); // The earlier call must also be 'fast' in order to do these transforms. - if (!OpC || !OpC->isFast()) + CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0)); + if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse()) return Ret; - // log(pow(x,y)) -> y*log(x) - // This is only applicable to log, log2, log10. - if (Name != "log" && Name != "log2" && Name != "log10") + LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb; + + // This is only applicable to log(), log2(), log10(). + if (TLI->getLibFunc(LogNm, LogLb)) + switch (LogLb) { + case LibFunc_logf: + LogID = Intrinsic::log; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log: + LogID = Intrinsic::log; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_logl: + LogID = Intrinsic::log; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log2f: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log2: + LogID = Intrinsic::log2; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log2l: + LogID = Intrinsic::log2; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + case LibFunc_log10f: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + break; + case LibFunc_log10: + LogID = Intrinsic::log10; + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + break; + case LibFunc_log10l: + LogID = Intrinsic::log10; + ExpLb = LibFunc_expl; + Exp2Lb = LibFunc_exp2l; + Exp10Lb = LibFunc_exp10l; + PowLb = LibFunc_powl; + break; + default: + return Ret; + } + else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 || + LogID == Intrinsic::log10) { + if (Ty->getScalarType()->isFloatTy()) { + ExpLb = LibFunc_expf; + Exp2Lb = LibFunc_exp2f; + Exp10Lb = LibFunc_exp10f; + PowLb = LibFunc_powf; + } else if (Ty->getScalarType()->isDoubleTy()) { + ExpLb = LibFunc_exp; + Exp2Lb = LibFunc_exp2; + Exp10Lb = LibFunc_exp10; + PowLb = LibFunc_pow; + } else + return Ret; + } else return Ret; 
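The classification above feeds the folds that follow: log_b(pow(x, y)) becomes y * log_b(x) and log_b(exp_k(y)) becomes y * log_b(k), both only under fast-math on the call and its operand. A minimal numerical illustration of the two identities:

#include <cmath>
#include <cstdio>

int main() {
  double x = 3.0, y = 1.75;
  // log(pow(x, y)) vs. y * log(x)
  std::printf("%.17g  %.17g\n", std::log(std::pow(x, y)), y * std::log(x));
  // log10(exp2(y)) vs. y * log10(2)
  std::printf("%.17g  %.17g\n", std::log10(std::exp2(y)),
              y * std::log10(2.0));
  return 0;
}
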
IRBuilder<>::FastMathFlagGuard Guard(B); - FastMathFlags FMF; - FMF.setFast(); - B.setFastMathFlags(FMF); + B.setFastMathFlags(FastMathFlags::getFast()); + + Intrinsic::ID ArgID = Arg->getIntrinsicID(); + LibFunc ArgLb = NotLibFunc; + TLI->getLibFunc(Arg, ArgLb); + + // log(pow(x,y)) -> y*log(x) + if (ArgLb == PowLb || ArgID == Intrinsic::pow) { + Value *LogX = + Log->doesNotAccessMemory() + ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Arg->getOperand(0), "log") + : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul"); + // Since pow() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } + + // log(exp{,2,10}(y)) -> y*log({e,2,10}) + // TODO: There is no exp10() intrinsic yet. + if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb || + ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) { + Constant *Eul; + if (ArgLb == ExpLb || ArgID == Intrinsic::exp) + // FIXME: Add more precise value of e for long double. + Eul = ConstantFP::get(Log->getType(), numbers::e); + else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2) + Eul = ConstantFP::get(Log->getType(), 2.0); + else + Eul = ConstantFP::get(Log->getType(), 10.0); + Value *LogE = Log->doesNotAccessMemory() + ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty), + Eul, "log") + : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs); + Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul"); + // Since exp() may have side effects, e.g. errno, + // dead code elimination may not be trusted to remove it. + substituteInParent(Arg, MulY); + return MulY; + } - LibFunc Func; - Function *F = OpC->getCalledFunction(); - if (F && ((TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) && - Func == LibFunc_pow) || F->getIntrinsicID() == Intrinsic::pow)) - return B.CreateFMul(OpC->getArgOperand(1), - emitUnaryFloatFnCall(OpC->getOperand(0), Callee->getName(), B, - Callee->getAttributes()), "mul"); - - // log(exp2(y)) -> y*log(2) - if (F && Name == "log" && TLI->getLibFunc(F->getName(), Func) && - TLI->has(Func) && Func == LibFunc_exp2) - return B.CreateFMul( - OpC->getArgOperand(0), - emitUnaryFloatFnCall(ConstantFP::get(CI->getType(), 2.0), - Callee->getName(), B, Callee->getAttributes()), - "logmul"); return Ret; } @@ -2137,6 +2458,7 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2154,9 +2476,11 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; // we found a format specifier, bail out. // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1) - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - ConstantInt::get(DL.getIntPtrType(CI->getContext()), - FormatStr.size() + 1)); // Copy the null byte. + B.CreateMemCpy( + CI->getArgOperand(0), Align::None(), CI->getArgOperand(1), + Align::None(), + ConstantInt::get(DL.getIntPtrType(CI->getContext()), + FormatStr.size() + 1)); // Copy the null byte. 
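The sprintf fold just above applies when the format string contains no '%' at all: the call degenerates to copying the literal (including its nul) and returning its length. A small standalone sketch of that equivalence:

#include <cstdio>
#include <cstring>

int main() {
  char buf[16];
  // int n = std::sprintf(buf, "hello");   // no '%' in the format; n == 5
  std::memcpy(buf, "hello", 6);            // the rewrite also copies the nul
  std::printf("%s %zu\n", buf, std::strlen(buf));
  return 0;
}
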
return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -2191,7 +2515,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) { return nullptr; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, IncLen); + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), IncLen); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -2231,21 +2556,21 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) { return New; } + annotateNonNullBasedOnAccess(CI, {0, 1}); return nullptr; } Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { - // Check for a fixed format string. - StringRef FormatStr; - if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) - return nullptr; - // Check for size ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1)); if (!Size) return nullptr; uint64_t N = Size->getZExtValue(); + // Check for a fixed format string. + StringRef FormatStr; + if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr)) + return nullptr; // If we just have a format string (nothing else crazy) transform it. if (CI->getNumArgOperands() == 3) { @@ -2262,7 +2587,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt, // strlen(fmt)+1) B.CreateMemCpy( - CI->getArgOperand(0), 1, CI->getArgOperand(2), 1, + CI->getArgOperand(0), Align::None(), CI->getArgOperand(2), + Align::None(), ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size() + 1)); // Copy the null byte. return ConstantInt::get(CI->getType(), FormatStr.size()); @@ -2303,7 +2629,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) { else if (N < Str.size() + 1) return nullptr; - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(3), 1, + B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(3), + Align::None(), ConstantInt::get(CI->getType(), Str.size() + 1)); // The snprintf result is the unincremented number of bytes in the string. @@ -2318,6 +2645,8 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) { return V; } + if (isKnownNonZero(CI->getOperand(1), DL)) + annotateNonNullBasedOnAccess(CI, 0); return nullptr; } @@ -2442,7 +2771,8 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. 
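The tradeoff described above: with a known string length, fputs(s, F) can be rewritten to fwrite(s, strlen(s), 1, F), but the fwrite form needs two extra arguments, which is why the rewrite is skipped when optimizing for size. Illustrative call shapes:

#include <cstdio>
#include <cstring>

int main() {
  const char *s = "hello\n";
  std::fputs(s, stdout);                      // original call
  std::fwrite(s, std::strlen(s), 1, stdout);  // rewritten form: same bytes,
                                              // two extra arguments to set up
  return 0;
}
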
bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) return nullptr; @@ -2503,6 +2833,7 @@ Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) { } Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { + annotateNonNullBasedOnAccess(CI, 0); if (!CI->use_empty()) return nullptr; @@ -2515,6 +2846,13 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), Align::None(), + CI->getArgOperand(0), Align::None(), + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2557,6 +2895,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrLen(CI, Builder); case LibFunc_strpbrk: return optimizeStrPBrk(CI, Builder); + case LibFunc_strndup: + return optimizeStrNDup(CI, Builder); case LibFunc_strtol: case LibFunc_strtod: case LibFunc_strtof: @@ -2573,12 +2913,18 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeStrStr(CI, Builder); case LibFunc_memchr: return optimizeMemChr(CI, Builder); + case LibFunc_memrchr: + return optimizeMemRChr(CI, Builder); case LibFunc_bcmp: return optimizeBCmp(CI, Builder); case LibFunc_memcmp: return optimizeMemCmp(CI, Builder); case LibFunc_memcpy: return optimizeMemCpy(CI, Builder); + case LibFunc_memccpy: + return optimizeMemCCpy(CI, Builder); + case LibFunc_mempcpy: + return optimizeMemPCpy(CI, Builder); case LibFunc_memmove: return optimizeMemMove(CI, Builder); case LibFunc_memset: @@ -2587,6 +2933,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } @@ -2626,11 +2974,21 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI, case LibFunc_sqrt: case LibFunc_sqrtl: return optimizeSqrt(CI, Builder); + case LibFunc_logf: case LibFunc_log: + case LibFunc_logl: + case LibFunc_log10f: case LibFunc_log10: + case LibFunc_log10l: + case LibFunc_log1pf: case LibFunc_log1p: + case LibFunc_log1pl: + case LibFunc_log2f: case LibFunc_log2: + case LibFunc_log2l: + case LibFunc_logbf: case LibFunc_logb: + case LibFunc_logbl: return optimizeLog(CI, Builder); case LibFunc_tan: case LibFunc_tanf: @@ -2721,10 +3079,18 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { case Intrinsic::exp2: return optimizeExp2(CI, Builder); case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: return optimizeLog(CI, Builder); case Intrinsic::sqrt: return optimizeSqrt(CI, Builder); // TODO: Use foldMallocMemset() with memset intrinsic. + case Intrinsic::memset: + return optimizeMemSet(CI, Builder); + case Intrinsic::memcpy: + return optimizeMemCpy(CI, Builder); + case Intrinsic::memmove: + return optimizeMemMove(CI, Builder); default: return nullptr; } @@ -2740,8 +3106,7 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) { IRBuilder<> TmpBuilder(SimplifiedCI); if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) { // If we were able to further simplify, remove the now redundant call. 
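The new optimizeBCopy hook lowers bcopy(src, dst, n) to llvm.memmove; the only wrinkle is the argument order, since bcopy takes (source, destination) while memmove takes (destination, source). A tiny sketch of the equivalence:

#include <cstdio>
#include <cstring>

int main() {
  char src[] = "abc", dst[4];
  // bcopy(src, dst, sizeof(src));      // legacy interface: (source, dest, len)
  std::memmove(dst, src, sizeof(src));  // rewrite target: (dest, source, len)
  std::puts(dst);
  return 0;
}
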
- SimplifiedCI->replaceAllUsesWith(V); - eraseFromParent(SimplifiedCI); + substituteInParent(SimplifiedCI, V); return V; } } @@ -2898,7 +3263,9 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp)); // If the length is 0 we don't know how long it is and so we can't // remove the check. - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, *StrOp, Len); + else return false; return ObjSizeCI->getZExtValue() >= Len; } @@ -2915,8 +3282,10 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemCpy(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2925,8 +3294,10 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI, Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI, IRBuilder<> &B) { if (isFortifiedCallFoldable(CI, 3, 2)) { - B.CreateMemMove(CI->getArgOperand(0), 1, CI->getArgOperand(1), 1, - CI->getArgOperand(2)); + CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(), + CI->getArgOperand(1), Align::None(), + CI->getArgOperand(2)); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2938,7 +3309,9 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI, if (isFortifiedCallFoldable(CI, 3, 2)) { Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); - B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); + CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, + CI->getArgOperand(2), Align::None()); + NewCI->setAttributes(CI->getAttributes()); return CI->getArgOperand(0); } return nullptr; @@ -2974,7 +3347,9 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI, // Maybe we can stil fold __st[rp]cpy_chk to __memcpy_chk. uint64_t Len = GetStringLength(Src); - if (Len == 0) + if (Len) + annotateDereferenceableBytes(CI, 1, Len); + else return nullptr; Type *SizeTTy = DL.getIntPtrType(CI->getContext()); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp index 1519751197d2..d2a400027d4b 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -10,28 +10,80 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/ProfileSummaryInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/SizeOpts.h" + using namespace llvm; -static cl::opt<bool> ProfileGuidedSizeOpt( +cl::opt<bool> EnablePGSO( "pgso", cl::Hidden, cl::init(true), - cl::desc("Enable the profile guided size optimization. ")); - -bool llvm::shouldOptimizeForSize(Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(F); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isFunctionColdInCallGraph(F, *BFI); + cl::desc("Enable the profile guided size optimizations. 
")); + +cl::opt<bool> PGSOLargeWorkingSetSizeOnly( + "pgso-lwss-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "if the working set size is large (except for cold code.)")); + +cl::opt<bool> PGSOColdCodeOnly( + "pgso-cold-code-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only " + "to cold code.")); + +cl::opt<bool> PGSOIRPassOrTestOnly( + "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false), + cl::desc("Apply the profile guided size optimizations only" + "to the IR passes or tests.")); + +cl::opt<bool> ForcePGSO( + "force-pgso", cl::Hidden, cl::init(false), + cl::desc("Force the (profiled-guided) size optimizations. ")); + +cl::opt<int> PgsoCutoffInstrProf( + "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for instrumentation profile.")); + +cl::opt<int> PgsoCutoffSampleProf( + "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore, + cl::desc("The profile guided size optimization profile summary cutoff " + "for sample profile.")); + +namespace { +struct BasicBlockBFIAdapter { + static bool isFunctionColdInCallGraph(const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionColdInCallGraph(F, BFI); + } + static bool isFunctionHotInCallGraphNthPercentile(int CutOff, + const Function *F, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo &BFI) { + return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI); + } + static bool isColdBlock(const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isColdBlock(BB, BFI); + } + static bool isHotBlockNthPercentile(int CutOff, + const BasicBlock *BB, + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI) { + return PSI->isHotBlockNthPercentile(CutOff, BB, BFI); + } +}; +} // end anonymous namespace + +bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI, + QueryType); } -bool llvm::shouldOptimizeForSize(BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - assert(BB); - if (!PSI || !BFI || !PSI->hasProfileSummary()) - return false; - return ProfileGuidedSizeOpt && PSI->isColdBlock(BB, BFI); +bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI, + QueryType); } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp index 50844cf9d1c5..7880ea1c6c47 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripGCRelocates.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/raw_ostream.h" diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp index 97a4533fabe5..21cbbfb140b6 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/DebugInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Utils.h" using namespace llvm; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp index 456724779b43..aacf81d83519 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/SymbolRewriter.cpp @@ -69,6 +69,7 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -380,11 +381,11 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, // TODO see if there is a more elegant solution to selecting the rewrite // descriptor type if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteFunctionDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>( Source, Target, Naked)); else DL->push_back( - llvm::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); + std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform)); return true; } @@ -442,11 +443,11 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteGlobalVariableDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique<PatternRewriteGlobalVariableDescriptor>( + DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>( Source, Transform)); return true; @@ -505,11 +506,11 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K, } if (!Target.empty()) - DL->push_back(llvm::make_unique<ExplicitRewriteNamedAliasDescriptor>( + DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>( Source, Target, /*Naked*/ false)); else - DL->push_back(llvm::make_unique<PatternRewriteNamedAliasDescriptor>( + DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>( Source, Transform)); return true; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp index 7f7bdf8a3d6d..9af39d9a0dd1 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp @@ -18,10 +18,16 @@ #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Transforms/Utils.h" using namespace llvm; char UnifyFunctionExitNodes::ID = 0; + +UnifyFunctionExitNodes::UnifyFunctionExitNodes() : FunctionPass(ID) { + initializeUnifyFunctionExitNodesPass(*PassRegistry::getPassRegistry()); +} + INITIALIZE_PASS(UnifyFunctionExitNodes, "mergereturn", "Unify function exit nodes", false, false) diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp index 5272ab6e95d5..7769c7493cda 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp +++ 
b/contrib/llvm-project/llvm/lib/Transforms/Utils/Utils.cpp @@ -39,6 +39,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) { initializeMetaRenamerPass(Registry); initializeStripGCRelocatesPass(Registry); initializePredicateInfoPrinterLegacyPassPass(Registry); + initializeInjectTLIMappingsLegacyPass(Registry); } /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp index a77bf50fe10b..591e1fd2dbee 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/VNCoercion.cpp @@ -431,7 +431,7 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy, PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal); NewLoad->takeName(SrcVal); - NewLoad->setAlignment(SrcVal->getAlignment()); + NewLoad->setAlignment(MaybeAlign(SrcVal->getAlignment())); LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp b/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp index fbc3407c301f..da68d3713b40 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -27,8 +27,8 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalIndirectSymbol.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" @@ -66,7 +66,7 @@ struct WorklistEntry { enum EntryKind { MapGlobalInit, MapAppendingVar, - MapGlobalAliasee, + MapGlobalIndirectSymbol, RemapFunction }; struct GVInitTy { @@ -77,9 +77,9 @@ struct WorklistEntry { GlobalVariable *GV; Constant *InitPrefix; }; - struct GlobalAliaseeTy { - GlobalAlias *GA; - Constant *Aliasee; + struct GlobalIndirectSymbolTy { + GlobalIndirectSymbol *GIS; + Constant *Target; }; unsigned Kind : 2; @@ -89,7 +89,7 @@ struct WorklistEntry { union { GVInitTy GVInit; AppendingGVTy AppendingGV; - GlobalAliaseeTy GlobalAliasee; + GlobalIndirectSymbolTy GlobalIndirectSymbol; Function *RemapF; } Data; }; @@ -161,8 +161,8 @@ public: bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers, unsigned MCID); - void scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID); + void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target, + unsigned MCID); void scheduleRemapFunction(Function &F, unsigned MCID); void flush(); @@ -172,7 +172,7 @@ private: void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, bool IsOldCtorDtor, ArrayRef<Constant *> NewMembers); - void mapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee); + void mapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target); void remapFunction(Function &F, ValueToValueMapTy &VM); ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; } @@ -774,20 +774,6 @@ Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) { return *getMappedOp(&FirstN); } -namespace { - -struct MapMetadataDisabler { - ValueToValueMapTy &VM; - - MapMetadataDisabler(ValueToValueMapTy &VM) : VM(VM) { - VM.disableMapMetadata(); - } - - ~MapMetadataDisabler() { VM.enableMapMetadata(); } 
-}; - -} // end anonymous namespace - Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) { // If the value already exists in the map, use it. if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD)) @@ -802,9 +788,6 @@ Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) { return const_cast<Metadata *>(MD); if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) { - // Disallow recursion into metadata mapping through mapValue. - MapMetadataDisabler MMD(getVM()); - // Don't memoize ConstantAsMetadata. Instead of lasting until the // LLVMContext is destroyed, they can be deleted when the GlobalValue they // reference is destructed. These aren't super common, so the extra @@ -846,9 +829,9 @@ void Mapper::flush() { AppendingInits.resize(PrefixSize); break; } - case WorklistEntry::MapGlobalAliasee: - E.Data.GlobalAliasee.GA->setAliasee( - mapConstant(E.Data.GlobalAliasee.Aliasee)); + case WorklistEntry::MapGlobalIndirectSymbol: + E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol( + mapConstant(E.Data.GlobalIndirectSymbol.Target)); break; case WorklistEntry::RemapFunction: remapFunction(*E.Data.RemapF); @@ -1041,16 +1024,16 @@ void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV, AppendingInits.append(NewMembers.begin(), NewMembers.end()); } -void Mapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - assert(AlreadyScheduled.insert(&GA).second && "Should not reschedule"); +void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, unsigned MCID) { + assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule"); assert(MCID < MCs.size() && "Invalid mapping context"); WorklistEntry WE; - WE.Kind = WorklistEntry::MapGlobalAliasee; + WE.Kind = WorklistEntry::MapGlobalIndirectSymbol; WE.MCID = MCID; - WE.Data.GlobalAliasee.GA = &GA; - WE.Data.GlobalAliasee.Aliasee = &Aliasee; + WE.Data.GlobalIndirectSymbol.GIS = &GIS; + WE.Data.GlobalIndirectSymbol.Target = &Target; Worklist.push_back(WE); } @@ -1147,9 +1130,10 @@ void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV, GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID); } -void ValueMapper::scheduleMapGlobalAliasee(GlobalAlias &GA, Constant &Aliasee, - unsigned MCID) { - getAsMapper(pImpl)->scheduleMapGlobalAliasee(GA, Aliasee, MCID); +void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, + Constant &Target, + unsigned MCID) { + getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID); } void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) { diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 4273080ddd91..7478daa2a0a5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -38,6 +38,7 @@ // could use this pass (with some modifications), but currently it implements // its own pass to do something similar to what we do here. 
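The LoadStoreVectorizer hunk below also changes areConsecutivePointers to take PtrDelta by value: after stripAndAccumulateInBoundsConstantOffsets the pointers may have been rewritten to a narrower index width, so all three byte offsets are sign-extended or truncated to one common width before they are compared. A minimal standalone sketch of that normalization, assuming only llvm::APInt; the helper name deltaMatches and the widths are illustrative, not part of the patch:

#include "llvm/ADT/APInt.h"
using llvm::APInt;

// Hypothetical helper mirroring the sextOrTrunc normalization added to
// areConsecutivePointers: APInt arithmetic asserts on mixed bit widths, so
// every operand is brought to NewPtrBitWidth before the subtraction.
static bool deltaMatches(APInt OffsetA, APInt OffsetB, APInt PtrDelta,
                         unsigned NewPtrBitWidth) {
  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
  return (OffsetB - OffsetA) == PtrDelta;
}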
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/MapVector.h" @@ -52,7 +53,6 @@ #include "llvm/Analysis/OrderedBasicBlock.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Attributes.h" @@ -71,14 +71,15 @@ #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" -#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include <algorithm> #include <cassert> #include <cstdlib> @@ -147,7 +148,7 @@ private: static const unsigned MaxDepth = 3; bool isConsecutiveAccess(Value *A, Value *B); - bool areConsecutivePointers(Value *PtrA, Value *PtrB, const APInt &PtrDelta, + bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth = 0) const; bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta, unsigned Depth) const; @@ -336,14 +337,29 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { } bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, - const APInt &PtrDelta, - unsigned Depth) const { + APInt PtrDelta, unsigned Depth) const { unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType()); APInt OffsetA(PtrBitWidth, 0); APInt OffsetB(PtrBitWidth, 0); PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA); PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB); + unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType()); + + if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType())) + return false; + + // In case if we have to shrink the pointer + // stripAndAccumulateInBoundsConstantOffsets should properly handle a + // possible overflow and the value should fit into a smallest data type + // used in the cast/gep chain. + assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth && + OffsetB.getMinSignedBits() <= NewPtrBitWidth); + + OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth); + OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth); + PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth); + APInt OffsetDelta = OffsetB - OffsetA; // Check if they are based on the same pointer. That makes the offsets @@ -650,7 +666,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) { // We can ignore the alias if the we have a load store pair and the load // is known to be invariant. The load cannot be clobbered by the store. auto IsInvariantLoad = [](const LoadInst *LI) -> bool { - return LI->getMetadata(LLVMContext::MD_invariant_load); + return LI->hasMetadata(LLVMContext::MD_invariant_load); }; // We can ignore the alias as long as the load comes before the store, @@ -1077,7 +1093,7 @@ bool Vectorizer::vectorizeLoadChain( LoadInst *L0 = cast<LoadInst>(Chain[0]); // If the vector has an int element, default to int for the whole load. 
- Type *LoadTy; + Type *LoadTy = nullptr; for (const auto &V : Chain) { LoadTy = cast<LoadInst>(V)->getType(); if (LoadTy->isIntOrIntVectorTy()) @@ -1089,6 +1105,7 @@ bool Vectorizer::vectorizeLoadChain( break; } } + assert(LoadTy && "Can't determine LoadInst type from chain"); unsigned Sz = DL.getTypeSizeInBits(LoadTy); unsigned AS = L0->getPointerAddressSpace(); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index 138f18e49c92..3f943f4c0688 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -13,7 +13,10 @@ // pass. It should be easy to create an analysis pass around it if there // is a need (but D45420 needs to happen first). // +#include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" @@ -47,38 +50,6 @@ static const unsigned MaxInterleaveFactor = 16; namespace llvm { -#ifndef NDEBUG -static void debugVectorizationFailure(const StringRef DebugMsg, - Instruction *I) { - dbgs() << "LV: Not vectorizing: " << DebugMsg; - if (I != nullptr) - dbgs() << " " << *I; - else - dbgs() << '.'; - dbgs() << '\n'; -} -#endif - -OptimizationRemarkAnalysis createLVMissedAnalysis(const char *PassName, - StringRef RemarkName, - Loop *TheLoop, - Instruction *I) { - Value *CodeRegion = TheLoop->getHeader(); - DebugLoc DL = TheLoop->getStartLoc(); - - if (I) { - CodeRegion = I->getParent(); - // If there is no debug location attached to the instruction, revert back to - // using the loop's. - if (I->getDebugLoc()) - DL = I->getDebugLoc(); - } - - OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); - R << "loop not vectorized: "; - return R; -} - bool LoopVectorizeHints::Hint::validate(unsigned Val) { switch (Kind) { case HK_WIDTH: @@ -88,6 +59,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) { case HK_FORCE: return (Val <= 1); case HK_ISVECTORIZED: + case HK_PREDICATE: return (Val == 0 || Val == 1); } return false; @@ -99,7 +71,9 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L, : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH), Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL), Force("vectorize.enable", FK_Undefined, HK_FORCE), - IsVectorized("isvectorized", 0, HK_ISVECTORIZED), TheLoop(L), ORE(ORE) { + IsVectorized("isvectorized", 0, HK_ISVECTORIZED), + Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L), + ORE(ORE) { // Populate values with existing loop metadata. getHintsFromMetadata(); @@ -250,7 +224,7 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) { return; unsigned Val = C->getZExtValue(); - Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized}; + Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate}; for (auto H : Hints) { if (Name == H->Name) { if (H->validate(Val)) @@ -435,7 +409,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { const ValueToValueMap &Strides = getSymbolicStrides() ? 
*getSymbolicStrides() : ValueToValueMap(); - int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false); + bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize(); + int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false); if (Stride == 1 || Stride == -1) return Stride; return 0; @@ -445,14 +420,6 @@ bool LoopVectorizationLegality::isUniform(Value *V) { return LAI->isUniform(V); } -void LoopVectorizationLegality::reportVectorizationFailure( - const StringRef DebugMsg, const StringRef OREMsg, - const StringRef ORETag, Instruction *I) const { - LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); - ORE->emit(createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - ORETag, TheLoop, I) << OREMsg); -} - bool LoopVectorizationLegality::canVectorizeOuterLoop() { assert(!TheLoop->empty() && "We are not vectorizing an outer loop."); // Store the result and return it at the end instead of exiting early, in case @@ -467,7 +434,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!Br) { reportVectorizationFailure("Unsupported basic block terminator", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -486,7 +453,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { !LI->isLoopHeader(Br->getSuccessor(1))) { reportVectorizationFailure("Unsupported conditional branch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -500,7 +467,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { TheLoop /*context outer loop*/)) { reportVectorizationFailure("Outer loop contains divergent loops", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -511,7 +478,7 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() { if (!setupOuterLoopInductions()) { reportVectorizationFailure("Unsupported outer loop Phi(s)", "Unsupported outer loop Phi(s)", - "UnsupportedPhi"); + "UnsupportedPhi", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -618,7 +585,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { !PhiTy->isPointerTy()) { reportVectorizationFailure("Found a non-int non-pointer PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); return false; } @@ -639,7 +606,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Phi->getNumIncomingValues() != 2) { reportVectorizationFailure("Found an invalid PHI", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood", Phi); + "CFGNotUnderstood", ORE, TheLoop, Phi); return false; } @@ -691,7 +658,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", - "NonReductionValueUsedOutsideLoop", Phi); + "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi); return false; } // end of PHI handling @@ -722,11 +689,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { "library call cannot be vectorized. 
" "Try compiling with -fno-math-errno, -ffast-math, " "or similar flags", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } else { reportVectorizationFailure("Found a non-intrinsic callsite", "call instruction cannot be vectorized", - "CantVectorizeLibcall", CI); + "CantVectorizeLibcall", ORE, TheLoop, CI); } return false; } @@ -741,7 +708,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) { reportVectorizationFailure("Found unvectorizable intrinsic", "intrinsic instruction cannot be vectorized", - "CantVectorizeIntrinsic", CI); + "CantVectorizeIntrinsic", ORE, TheLoop, CI); return false; } } @@ -754,7 +721,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { isa<ExtractElementInst>(I)) { reportVectorizationFailure("Found unvectorizable type", "instruction return type cannot be vectorized", - "CantVectorizeInstructionReturnType", &I); + "CantVectorizeInstructionReturnType", ORE, TheLoop, &I); return false; } @@ -764,7 +731,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (!VectorType::isValidElementType(T)) { reportVectorizationFailure("Store instruction cannot be vectorized", "store instruction cannot be vectorized", - "CantVectorizeStore", ST); + "CantVectorizeStore", ORE, TheLoop, ST); return false; } @@ -774,12 +741,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Arbitrarily try a vector of 2 elements. Type *VecTy = VectorType::get(T, /*NumElements=*/2); assert(VecTy && "did not find vectorized version of stored type"); - unsigned Alignment = getLoadStoreAlignment(ST); - if (!TTI->isLegalNTStore(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(ST); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTStore(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal store instruction cannot be vectorized", "nontemporal store instruction cannot be vectorized", - "CantVectorizeNontemporalStore", ST); + "CantVectorizeNontemporalStore", ORE, TheLoop, ST); return false; } } @@ -790,12 +758,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // supported on the target (arbitrarily try a vector of 2 elements). Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2); assert(VecTy && "did not find vectorized version of load type"); - unsigned Alignment = getLoadStoreAlignment(LD); - if (!TTI->isLegalNTLoad(VecTy, Alignment)) { + const MaybeAlign Alignment = getLoadStoreAlignment(LD); + assert(Alignment && "Alignment should be set"); + if (!TTI->isLegalNTLoad(VecTy, *Alignment)) { reportVectorizationFailure( "nontemporal load instruction cannot be vectorized", "nontemporal load instruction cannot be vectorized", - "CantVectorizeNontemporalLoad", LD); + "CantVectorizeNontemporalLoad", ORE, TheLoop, LD); return false; } } @@ -824,7 +793,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { } reportVectorizationFailure("Value cannot be used outside the loop", "value cannot be used outside the loop", - "ValueUsedOutsideLoop", &I); + "ValueUsedOutsideLoop", ORE, TheLoop, &I); return false; } } // next instr. 
@@ -834,18 +803,30 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { if (Inductions.empty()) { reportVectorizationFailure("Did not find one integer induction var", "loop induction variable could not be identified", - "NoInductionVariable"); + "NoInductionVariable", ORE, TheLoop); return false; } else if (!WidestIndTy) { reportVectorizationFailure("Did not find one integer induction var", "integer loop induction variable could not be identified", - "NoIntegerInductionVariable"); + "NoIntegerInductionVariable", ORE, TheLoop); return false; } else { LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); } } + // For first order recurrences, we use the previous value (incoming value from + // the latch) to check if it dominates all users of the recurrence. Bail out + // if we have to sink such an instruction for another recurrence, as the + // dominance requirement may not hold after sinking. + BasicBlock *LoopLatch = TheLoop->getLoopLatch(); + if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) { + Instruction *V = + cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch)); + return SinkAfter.find(V) != SinkAfter.end(); + })) + return false; + // Now we know the widest induction type, check if our found induction // is the same size. If it's not, unset it here and InnerLoopVectorizer // will create another. @@ -870,7 +851,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (LAI->hasDependenceInvolvingLoopInvariantAddress()) { reportVectorizationFailure("Stores to a uniform address", "write to a loop invariant address could not be vectorized", - "CantVectorizeStoreToLoopInvariantAddress"); + "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop); return false; } Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks()); @@ -906,7 +887,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { } bool LoopVectorizationLegality::blockCanBePredicated( - BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) { + BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) { const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); for (Instruction &I : *BB) { @@ -925,7 +906,7 @@ bool LoopVectorizationLegality::blockCanBePredicated( // !llvm.mem.parallel_loop_access implies if-conversion safety. // Otherwise, record that the load needs (real or emulated) masking // and let the cost model decide. - if (!IsAnnotatedParallel) + if (!IsAnnotatedParallel || PreserveGuards) MaskedOp.insert(LI); continue; } @@ -954,23 +935,41 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!EnableIfConversion) { reportVectorizationFailure("If-conversion is disabled", "if-conversion is disabled", - "IfConversionDisabled"); + "IfConversionDisabled", + ORE, TheLoop); return false; } assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); - // A list of pointers that we can safely read and write to. + // A list of pointers which are known to be dereferenceable within scope of + // the loop body for each iteration of the loop which executes. That is, + // the memory pointed to can be dereferenced (with the access size implied by + // the value's type) unconditionally within the loop header without + // introducing a new fault. SmallPtrSet<Value *, 8> SafePointes; // Collect safe addresses. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (blockNeedsPredication(BB)) + if (!blockNeedsPredication(BB)) { + for (Instruction &I : *BB) + if (auto *Ptr = getLoadStorePointerOperand(&I)) + SafePointes.insert(Ptr); continue; + } - for (Instruction &I : *BB) - if (auto *Ptr = getLoadStorePointerOperand(&I)) - SafePointes.insert(Ptr); + // For a block which requires predication, an address may be safe to access + // in the loop w/o predication if we can prove dereferenceability facts + // sufficient to ensure it'll never fault within the loop. For the moment, + // we restrict this to loads; stores are more complicated due to + // concurrency restrictions. + ScalarEvolution &SE = *PSE.getSE(); + for (Instruction &I : *BB) { + LoadInst *LI = dyn_cast<LoadInst>(&I); + if (LI && !mustSuppressSpeculation(*LI) && + isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT)) + SafePointes.insert(LI->getPointerOperand()); + } } // Collect the blocks that need predication. @@ -980,7 +979,8 @@ if (!isa<BranchInst>(BB->getTerminator())) { reportVectorizationFailure("Loop contains a switch statement", "loop contains a switch statement", - "LoopContainsSwitch", BB->getTerminator()); + "LoopContainsSwitch", ORE, TheLoop, + BB->getTerminator()); return false; } @@ -990,14 +990,16 @@ reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } else if (BB != Header && !canIfConvertPHINodes(BB)) { reportVectorizationFailure( "Control flow cannot be substituted for a select", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } @@ -1027,7 +1029,7 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp, if (!Lp->getLoopPreheader()) { reportVectorizationFailure("Loop doesn't have a legal pre-header", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1038,7 +1040,7 @@ if (Lp->getNumBackEdges() != 1) { reportVectorizationFailure("The loop must have a single backedge", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1049,7 +1051,7 @@ if (!Lp->getExitingBlock()) { reportVectorizationFailure("The loop must have an exiting block", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1062,7 +1064,7 @@ if (Lp->getExitingBlock() != Lp->getLoopLatch()) { reportVectorizationFailure("The exiting block is not the loop latch", "loop control flow is not understood by vectorizer", - "CFGNotUnderstood"); + "CFGNotUnderstood", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1125,7 +1127,8 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (!canVectorizeOuterLoop()) { reportVectorizationFailure("Unsupported outer loop", "unsupported outer loop", - 
"UnsupportedOuterLoop"); + "UnsupportedOuterLoop", + ORE, TheLoop); // TODO: Implement DoExtraAnalysis when subsequent legal checks support // outer loops. return false; @@ -1177,7 +1180,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) { reportVectorizationFailure("Too many SCEV checks needed", "Too many SCEV assumptions need to be made and checked at runtime", - "TooManySCEVRunTimeChecks"); + "TooManySCEVRunTimeChecks", ORE, TheLoop); if (DoExtraAnalysis) Result = false; else @@ -1191,7 +1194,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) { return Result; } -bool LoopVectorizationLegality::canFoldTailByMasking() { +bool LoopVectorizationLegality::prepareToFoldTailByMasking() { LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); @@ -1200,22 +1203,21 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { "No primary induction, cannot fold tail by masking", "Missing a primary induction variable in the loop, which is " "needed in order to fold tail by masking as required.", - "NoPrimaryInduction"); + "NoPrimaryInduction", ORE, TheLoop); return false; } - // TODO: handle reductions when tail is folded by masking. - if (!Reductions.empty()) { - reportVectorizationFailure( - "Loop has reductions, cannot fold tail by masking", - "Cannot fold tail by masking in the presence of reductions.", - "ReductionFoldingTailByMasking"); - return false; - } + SmallPtrSet<const Value *, 8> ReductionLiveOuts; - // TODO: handle outside users when tail is folded by masking. + for (auto &Reduction : *getReductionVars()) + ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { - // Check that all users of allowed exit values are inside the loop. + // Check that all users of allowed exit values are inside the loop or + // are the live-out of a reduction. + if (ReductionLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast<Instruction>(U); if (TheLoop->contains(UI)) @@ -1223,7 +1225,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { reportVectorizationFailure( "Cannot fold tail by masking, loop has an outside user for", "Cannot fold tail by masking in the presence of live outs.", - "LiveOutFoldingTailByMasking", UI); + "LiveOutFoldingTailByMasking", ORE, TheLoop, UI); return false; } } @@ -1234,11 +1236,12 @@ bool LoopVectorizationLegality::canFoldTailByMasking() { // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. 
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers)) { + if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) { reportVectorizationFailure( "Cannot fold tail by masking as required", "control flow cannot be substituted for a select", - "NoCFGForSelect", BB->getTerminator()); + "NoCFGForSelect", ORE, TheLoop, + BB->getTerminator()); return false; } } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 97077cce83e3..e5edd305d3d5 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -201,6 +201,9 @@ class LoopVectorizationPlanner { /// The profitability analysis. LoopVectorizationCostModel &CM; + /// The interleaved access analysis. + InterleavedAccessInfo &IAI; + SmallVector<VPlanPtr, 4> VPlans; /// This class is used to enable the VPlan to invoke a method of ILV. This is @@ -211,6 +214,8 @@ class LoopVectorizationPlanner { VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {} Value *getOrCreateVectorValues(Value *V, unsigned Part) override; + Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) override; }; /// A builder used to construct the current plan. @@ -223,16 +228,18 @@ public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM) {} + LoopVectorizationCostModel &CM, + InterleavedAccessInfo &IAI) + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI) {} /// Plan how to best vectorize, return the best VF and its cost, or None if /// vectorization and interleaving should be avoided up front. - Optional<VectorizationFactor> plan(bool OptForSize, unsigned UserVF); + Optional<VectorizationFactor> plan(unsigned UserVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(bool OptForSize, unsigned UserVF); + VectorizationFactor planInVPlanNativePath(unsigned UserVF); /// Finalize the best decision and dispose of all other VPlans. void setBestPlan(unsigned VF, unsigned UF); @@ -272,9 +279,10 @@ private: /// Build a VPlan using VPRecipes according to the information gathered by /// Legal. This method is only used for the legacy inner loop vectorizer. 
- VPlanPtr - buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions); + VPlanPtr buildVPlanWithVPRecipes( + VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 46265e3f3e13..ebfd5fe8b762 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -58,8 +58,8 @@ #include "VPRecipeBuilder.h" #include "VPlan.h" #include "VPlanHCFGBuilder.h" -#include "VPlanHCFGTransforms.h" #include "VPlanPredicator.h" +#include "VPlanTransforms.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -124,6 +124,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -149,7 +150,6 @@ #include <string> #include <tuple> #include <utility> -#include <vector> using namespace llvm; @@ -177,6 +177,14 @@ static cl::opt<unsigned> TinyTripCountVectorThreshold( "value are vectorized only if no scalar iteration overheads " "are incurred.")); +// Indicates that an epilogue is undesired; predication is preferred. +// This means that the vectorizer will try to fold the loop-tail (epilogue) +// into the loop and predicate the loop body accordingly. +static cl::opt<bool> PreferPredicateOverEpilog( + "prefer-predicate-over-epilog", cl::init(false), cl::Hidden, + cl::desc("Indicate that an epilogue is undesired; predication should be " + "used instead.")); + static cl::opt<bool> MaximizeBandwidth( + "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -192,9 +200,10 @@ static cl::opt<bool> EnableMaskedInterleavedMemAccesses( "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop")); -/// We don't interleave loops with a known constant trip count below this -/// number. -static const unsigned TinyTripCountInterleaveThreshold = 128; +static cl::opt<unsigned> TinyTripCountInterleaveThreshold( + "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden, + cl::desc("We don't interleave loops with an estimated constant trip count " + "below this number")); static cl::opt<unsigned> ForceTargetNumScalarRegs( "force-target-num-scalar-regs", cl::init(0), cl::Hidden, @@ -347,6 +356,29 @@ static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) { : ConstantFP::get(Ty, C); } +/// Returns the "best known" trip count for the specified loop \p L as defined by +/// the following procedure: +/// 1) Returns exact trip count if it is known. +/// 2) Returns expected trip count according to profile data if any. +/// 3) Returns upper bound estimate if it is known. +/// 4) Returns None if all of the above failed. +static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { + // Check if exact trip count is known. 
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + return ExpectedTC; + + // Check if there is an expected trip count available from profile data. + if (LoopVectorizeWithBlockFrequency) + if (auto EstimatedTC = getLoopEstimatedTripCount(L)) + return EstimatedTC; + + // Check if upper bound estimate is known. + if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + return ExpectedTC; + + return None; +} + namespace llvm { /// InnerLoopVectorizer vectorizes loops which contain only one basic @@ -396,6 +428,11 @@ public: /// new unrolled loop, where UF is the unroll factor. using VectorParts = SmallVector<Value *, 2>; + /// Vectorize a single GetElementPtrInst based on information gathered and + /// decisions taken during planning. + void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF, + bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant); + /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -445,15 +482,20 @@ public: /// Construct the vector value of a scalarized value \p V one lane at a time. void packScalarIntoVectorValue(Value *V, const VPIteration &Instance); - /// Try to vectorize the interleaved access group that \p Instr belongs to, - /// optionally masking the vector operations if \p BlockInMask is non-null. - void vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask = nullptr); - - /// Vectorize Load and Store instructions, optionally masking the vector - /// operations if \p BlockInMask is non-null. - void vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask = nullptr); + /// Try to vectorize the interleaved access group that \p Instr belongs to + /// with the base address given in \p Addr, optionally masking the vector + /// operations if \p BlockInMask is non-null. Use \p State to translate given + /// VPValues to IR values in the vectorized loop. + void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State, + VPValue *Addr, VPValue *BlockInMask = nullptr); + + /// Vectorize Load and Store instructions with the base address given in \p + /// Addr, optionally masking the vector operations if \p BlockInMask is + /// non-null. Use \p State to translate given VPValues to IR values in the + /// vectorized loop. + void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -494,6 +536,9 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); + /// Clear NSW/NUW flags from reduction instructions if necessary. + void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc); + /// The Loop exit block may have single value PHI nodes with some /// incoming value. While vectorizing we only handled real values /// that were defined inside the loop and we should have one value for @@ -508,10 +553,6 @@ protected: /// represented as. void truncateToMinimalBitwidths(); - /// Insert the new loop to the loop hierarchy and pass manager - /// and update the analysis passes. - void updateAnalysis(); - /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... 
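The getSmallBestKnownTC helper above encodes a strict preference order between trip-count sources. A self-contained toy analogue of that fallback chain, using std::optional in place of llvm::Optional and plain integers in place of the ScalarEvolution and profile queries (all names here are illustrative, not LLVM API):

#include <optional>

// Each source reports 0 / std::nullopt when it knows nothing; the first
// source with an answer wins, matching steps 1-4 of the comment above.
std::optional<unsigned> bestKnownTripCount(unsigned ExactTC,
                                           std::optional<unsigned> ProfileTC,
                                           unsigned MaxTC) {
  if (ExactTC)         // 1) exact trip count
    return ExactTC;
  if (ProfileTC)       // 2) expected trip count from profile data
    return ProfileTC;
  if (MaxTC)           // 3) known upper-bound estimate
    return MaxTC;
  return std::nullopt; // 4) nothing known
}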
@@ -795,6 +836,59 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) B.SetCurrentDebugLocation(DebugLoc()); } +/// Write a record \p DebugMsg about vectorization failure to the debug +/// output stream. If \p I is passed, it is an instruction that prevents +/// vectorization. +#ifndef NDEBUG +static void debugVectorizationFailure(const StringRef DebugMsg, + Instruction *I) { + dbgs() << "LV: Not vectorizing: " << DebugMsg; + if (I != nullptr) + dbgs() << " " << *I; + else + dbgs() << '.'; + dbgs() << '\n'; +} +#endif + +/// Create an analysis remark that explains why vectorization failed +/// +/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p +/// RemarkName is the identifier for the remark. If \p I is passed it is an +/// instruction that prevents vectorization. Otherwise \p TheLoop is used for +/// the location of the remark. \return the remark object that can be +/// streamed to. +static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, + StringRef RemarkName, Loop *TheLoop, Instruction *I) { + Value *CodeRegion = TheLoop->getHeader(); + DebugLoc DL = TheLoop->getStartLoc(); + + if (I) { + CodeRegion = I->getParent(); + // If there is no debug location attached to the instruction, revert to + // using the loop's. + if (I->getDebugLoc()) + DL = I->getDebugLoc(); + } + + OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion); + R << "loop not vectorized: "; + return R; +} + +namespace llvm { + +void reportVectorizationFailure(const StringRef DebugMsg, + const StringRef OREMsg, const StringRef ORETag, + OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) { + LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I)); + LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE); + ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(), + ORETag, TheLoop, I) << OREMsg); +} + +} // end namespace llvm + #ifndef NDEBUG /// \return string containing a file name and a line # for the given loop. static std::string getDebugLocString(const Loop *L) { @@ -836,6 +930,26 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To, namespace llvm { +// Loop vectorization cost-model hints how the scalar epilogue loop should be +// lowered. +enum ScalarEpilogueLowering { + + // The default: allowing scalar epilogues. + CM_ScalarEpilogueAllowed, + + // Vectorization with OptForSize: don't allow epilogues. + CM_ScalarEpilogueNotAllowedOptSize, + + // A special case of vectorization with OptForSize: loops with a very small + // trip count are considered for vectorization under OptForSize, thereby + // making sure the cost of their loop body is dominant, free of runtime + // guards and scalar iteration overheads. + CM_ScalarEpilogueNotAllowedLowTripLoop, + + // Loop hint predicate indicating an epilogue is undesired. + CM_ScalarEpilogueNotNeededUsePredicate +}; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -845,20 +959,26 @@ namespace llvm { /// different operations. 
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE, - LoopInfo *LI, LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, + PredicatedScalarEvolution &PSE, LoopInfo *LI, + LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), - AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), + TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), + Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factor, or None if /// vectorization and interleaving should be avoided up front. - Optional<unsigned> computeMaxVF(bool OptForSize); + Optional<unsigned> computeMaxVF(); + + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO @@ -881,8 +1001,7 @@ public: /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(bool OptForSize, unsigned VF, - unsigned LoopCost); + unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -897,10 +1016,11 @@ public: /// of a loop. struct RegisterUsage { /// Holds the number of loop invariant values that are used in the loop. - unsigned LoopInvariantRegs; - + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs; /// Holds the maximum number of concurrent live intervals in the loop. - unsigned MaxLocalUsers; + /// The key is ClassID of target-provided register class. + SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers; }; /// \return Returns information about the register usages of the loop for the @@ -1080,26 +1200,28 @@ public: /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedStore(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType); + bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedStore(DataType, Alignment); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr) { - return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType); + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) { + return Legal->isConsecutivePtr(Ptr) && + TTI.isLegalMaskedLoad(DataType, Alignment); } /// Returns true if the target machine supports masked scatter operation /// for the given \p DataType. 
- bool isLegalMaskedScatter(Type *DataType) { - return TTI.isLegalMaskedScatter(DataType); + bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedScatter(DataType, Alignment); } /// Returns true if the target machine supports masked gather operation /// for the given \p DataType. - bool isLegalMaskedGather(Type *DataType) { - return TTI.isLegalMaskedGather(DataType); + bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { + return TTI.isLegalMaskedGather(DataType, Alignment); } /// Returns true if the target machine can represent \p V as a masked gather @@ -1110,7 +1232,9 @@ public: if (!LI && !SI) return false; auto *Ty = getMemInstValueType(V); - return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty)); + MaybeAlign Align = getLoadStoreAlignment(V); + return (LI && isLegalMaskedGather(Ty, Align)) || + (SI && isLegalMaskedScatter(Ty, Align)); } /// Returns true if \p I is an instruction that will be scalarized with @@ -1157,11 +1281,14 @@ public: /// to handle accesses with gaps, and there is nothing preventing us from /// creating a scalar epilogue. bool requiresScalarEpilogue() const { - return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue(); + return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue(); } - /// Returns true if a scalar epilogue is not allowed due to optsize. - bool isScalarEpilogueAllowed() const { return IsScalarEpilogueAllowed; } + /// Returns true if a scalar epilogue is not allowed due to optsize or a + /// loop hint annotation. + bool isScalarEpilogueAllowed() const { + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } @@ -1187,7 +1314,7 @@ private: /// \return An upper bound for the vectorization factor, larger than zero. /// One is returned if vectorization should best be avoided due to cost. - unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount); + unsigned computeFeasibleMaxVF(unsigned ConstTripCount); /// The vectorization cost is a combination of the cost itself and a boolean /// indicating whether any of the contributing operations will actually @@ -1246,15 +1373,6 @@ private: /// should be used. bool useEmulatedMaskMemRefHack(Instruction *I); - /// Create an analysis remark that explains why vectorization failed - /// - /// \p RemarkName is the identifier for the remark. \return the remark object - /// that can be streamed to. - OptimizationRemarkAnalysis createMissedAnalysis(StringRef RemarkName) { - return createLVMissedAnalysis(Hints->vectorizeAnalysisPassName(), - RemarkName, TheLoop); - } - /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated /// to this type. @@ -1270,13 +1388,13 @@ private: SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization; /// Records whether it is allowed to have the original scalar loop execute at - /// least once. This may be needed as a fallback loop in case runtime + /// least once. This may be needed as a fallback loop in case runtime /// aliasing/dependence checks fail, or to handle the tail/remainder /// iterations when the trip count is unknown or doesn't divide by the VF, /// or as a peel-loop to handle gaps in interleave-groups. /// Under optsize and when the trip count is very small we don't allow any /// iterations to execute in the scalar loop. 
- bool IsScalarEpilogueAllowed = true; + ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; @@ -1496,7 +1614,7 @@ struct LoopVectorize : public FunctionPass { auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); @@ -2049,7 +2167,9 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) { // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements // store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { const InterleaveGroup<Instruction> *Group = Cost->getInterleavedAccessGroup(Instr); assert(Group && "Fail to get an interleaved access group."); @@ -2059,27 +2179,19 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, return; const DataLayout &DL = Instr->getModule()->getDataLayout(); - Value *Ptr = getLoadStorePointerOperand(Instr); // Prepare for the vector type of the interleaved load/store. Type *ScalarTy = getMemInstValueType(Instr); unsigned InterleaveFactor = Group->getFactor(); Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF); - Type *PtrTy = VecTy->getPointerTo(getLoadStoreAddressSpace(Instr)); // Prepare for the new pointers. - setDebugLocFromInst(Builder, Ptr); - SmallVector<Value *, 2> NewPtrs; + SmallVector<Value *, 2> AddrParts; unsigned Index = Group->getIndex(Instr); - VectorParts Mask; - bool IsMaskForCondRequired = BlockInMask; - if (IsMaskForCondRequired) { - Mask = *BlockInMask; - // TODO: extend the masked interleaved-group support to reversed access. - assert(!Group->isReverse() && "Reversed masked interleave-group " - "not supported."); - } + // TODO: extend the masked interleaved-group support to reversed access. + assert((!BlockInMask || !Group->isReverse()) && + "Reversed masked interleave-group not supported."); // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, @@ -2090,12 +2202,9 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, if (Group->isReverse()) Index += (VF - 1) * Group->getFactor(); - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) - InBounds = gep->isInBounds(); - for (unsigned Part = 0; Part < UF; Part++) { - Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0}); + Value *AddrPart = State.get(Addr, {Part, 0}); + setDebugLocFromInst(Builder, AddrPart); // Notice that the current instruction could be at any index. We need to adjust // the address to the member of index 0. @@ -2108,12 +2217,17 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, // A[i] = b; // Member of index 0 // A[i+2] = c; // Member of index 2 (Current instruction) // The current pointer points to A[i+2]; adjust it to A[i]. 
- NewPtr = Builder.CreateGEP(ScalarTy, NewPtr, Builder.getInt32(-Index)); - if (InBounds) - cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true); + + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts())) + InBounds = gep->isInBounds(); + AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index)); + cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds); // Cast to the vector pointer type. - NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace(); + Type *PtrTy = VecTy->getPointerTo(AddressSpace); + AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy)); } setDebugLocFromInst(Builder, Instr); @@ -2131,26 +2245,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, SmallVector<Value *, 2> NewLoads; for (unsigned Part = 0; Part < UF; Part++) { Instruction *NewLoad; - if (IsMaskForCondRequired || MaskForGaps) { + if (BlockInMask || MaskForGaps) { assert(useMaskedInterleavedAccesses(*TTI) && "masked interleaved groups are not allowed."); Value *GroupMask = MaskForGaps; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And, ShuffledMask, MaskForGaps) : ShuffledMask; } NewLoad = - Builder.CreateMaskedLoad(NewPtrs[Part], Group->getAlignment(), + Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(), GroupMask, UndefVec, "wide.masked.vec"); } else - NewLoad = Builder.CreateAlignedLoad(VecTy, NewPtrs[Part], + NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part], Group->getAlignment(), "wide.vec"); Group->addMetadata(NewLoad); NewLoads.push_back(NewLoad); @@ -2219,24 +2334,27 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr, "interleaved.vec"); Instruction *NewStoreInstr; - if (IsMaskForCondRequired) { - auto *Undefs = UndefValue::get(Mask[Part]->getType()); + if (BlockInMask) { + Value *BlockInMaskPart = State.get(BlockInMask, Part); + auto *Undefs = UndefValue::get(BlockInMaskPart->getType()); auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF); Value *ShuffledMask = Builder.CreateShuffleVector( - Mask[Part], Undefs, RepMask, "interleaved.mask"); + BlockInMaskPart, Undefs, RepMask, "interleaved.mask"); NewStoreInstr = Builder.CreateMaskedStore( - IVec, NewPtrs[Part], Group->getAlignment(), ShuffledMask); + IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask); } else - NewStoreInstr = Builder.CreateAlignedStore(IVec, NewPtrs[Part], - Group->getAlignment()); + NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part], + Group->getAlignment()); Group->addMetadata(NewStoreInstr); } } void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, - VectorParts *BlockInMask) { + VPTransformState &State, + VPValue *Addr, + VPValue *BlockInMask) { // Attempt to issue a wide load. 
LoadInst *LI = dyn_cast<LoadInst>(Instr); StoreInst *SI = dyn_cast<StoreInst>(Instr); @@ -2248,18 +2366,15 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point"); if (Decision == LoopVectorizationCostModel::CM_Interleave) - return vectorizeInterleaveGroup(Instr); + return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask); Type *ScalarDataTy = getMemInstValueType(Instr); Type *DataTy = VectorType::get(ScalarDataTy, VF); - Value *Ptr = getLoadStorePointerOperand(Instr); - unsigned Alignment = getLoadStoreAlignment(Instr); // An alignment of 0 means target abi alignment. We need to use the scalar's // target abi alignment in such a case. const DataLayout &DL = Instr->getModule()->getDataLayout(); - if (!Alignment) - Alignment = DL.getABITypeAlignment(ScalarDataTy); - unsigned AddressSpace = getLoadStoreAddressSpace(Instr); + const Align Alignment = + DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2273,25 +2388,22 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && "The instruction should be scalarized"); + (void)ConsecutiveStride; - // Handle consecutive loads/stores. - if (ConsecutiveStride) - Ptr = getOrCreateScalarValue(Ptr, {0, 0}); - - VectorParts Mask; + VectorParts BlockInMaskParts(UF); bool isMaskRequired = BlockInMask; if (isMaskRequired) - Mask = *BlockInMask; - - bool InBounds = false; - if (auto *gep = dyn_cast<GetElementPtrInst>( - getLoadStorePointerOperand(Instr)->stripPointerCasts())) - InBounds = gep->isInBounds(); + for (unsigned Part = 0; Part < UF; ++Part) + BlockInMaskParts[Part] = State.get(BlockInMask, Part); const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. GetElementPtrInst *PartPtr = nullptr; + bool InBounds = false; + if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (Reverse) { // If the address is consecutive but reversed, then the // wide store needs to start at the last vector element. @@ -2302,13 +2414,14 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF))); PartPtr->setIsInBounds(InBounds); if (isMaskRequired) // Reverse of a null all-one mask is a null mask. - Mask[Part] = reverseVector(Mask[Part]); + BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]); } else { PartPtr = cast<GetElementPtrInst>( Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF))); PartPtr->setIsInBounds(InBounds); } + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); }; @@ -2320,10 +2433,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Instruction *NewSI = nullptr; Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, + Alignment.value(), MaskPart); } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -2332,12 +2445,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, // We don't want to update the value in the map as it might be used in // another expression. So don't call resetVectorValue(StoredVal). } - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - Mask[Part]); + NewSI = Builder.CreateMaskedStore( + StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]); else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + NewSI = + Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value()); } addMetadata(NewSI, SI); } @@ -2350,20 +2464,20 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? Mask[Part] : nullptr; - Value *VectorGep = getOrCreateVectorValue(Ptr, Part); - NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart, + Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *VectorGep = State.get(Addr, Part); + NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart, nullptr, "wide.masked.gather"); addMetadata(NewLI, LI); } else { - auto *VecPtr = CreateVecPtr(Part, Ptr); + auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0})); if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad(VecPtr, Alignment, Mask[Part], - UndefValue::get(DataTy), - "wide.masked.load"); + NewLI = Builder.CreateMaskedLoad( + VecPtr, Alignment.value(), BlockInMaskParts[Part], + UndefValue::get(DataTy), "wide.masked.load"); else - NewLI = - Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(), + "wide.load"); // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -2570,8 +2684,10 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy, void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass) { Value *Count = getOrCreateTripCount(L); - BasicBlock *BB = L->getLoopPreheader(); - IRBuilder<> Builder(BB->getTerminator()); + // Reuse existing vector loop preheader for TC checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const TCCheckBlock = LoopVectorPreHeader; + IRBuilder<> Builder(TCCheckBlock->getTerminator()); // Generate code to check if the loop's trip count is less than VF * UF, or // equal to it in case a scalar epilogue is required; this implies that the @@ -2588,47 +2704,61 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, P, Count, ConstantInt::get(Count->getType(), VF * UF), "min.iters.check"); - BasicBlock *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. 
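// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the control flow that the
// min.iters.check above guards, written as plain C++. The vector body
// consumes VF * UF iterations at a time, so it is bypassed (branch to the
// scalar preheader) unless at least that many iterations remain; the
// comparison becomes ULE instead of ULT when a scalar epilogue is required.
// Function and constant names here are made up.
#include <cstddef>

void saxpy(float *Y, const float *X, float A, std::size_t N) {
  const std::size_t VF = 4, UF = 2; // vectorization and unroll factors
  std::size_t I = 0;
  if (!(N < VF * UF)) {             // min.iters.check; false => vector.ph
    std::size_t VectorN = N - N % (VF * UF);
    for (; I < VectorN; ++I)        // stands in for vector.body, which
      Y[I] += A * X[I];             // really advances by VF * UF per trip
  }
  for (; I < N; ++I)                // scalar epilogue reached via scalar.ph
    Y[I] += A * X[I];
}
// ---------------------------------------------------------------------------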
- DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, CheckMinIters)); - LoopBypassBlocks.push_back(BB); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + assert(DT->properlyDominates(DT->getNode(TCCheckBlock), + DT->getNode(Bypass)->getIDom()) && + "TC check is expected to dominate Bypass"); + + // Update dominator for Bypass & LoopExit. + DT->changeImmediateDominator(Bypass, TCCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock); + + ReplaceInstWithInst( + TCCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters)); + LoopBypassBlocks.push_back(TCCheckBlock); } void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for SCEV checks. + // Note that new preheader block is generated for vector loop. + BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; // Generate the code to check that the SCEV assumptions that we made. // We want the new basic block to start at the first instruction in a // sequence of instructions that form a check. SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), "scev.check"); - Value *SCEVCheck = - Exp.expandCodeForPredicate(&PSE.getUnionPredicate(), BB->getTerminator()); + Value *SCEVCheck = Exp.expandCodeForPredicate( + &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) if (C->isZero()) return; - assert(!Cost->foldTailByMasking() && - "Cannot SCEV check stride or overflow when folding tail"); - // Create a new block containing the stride check. - BB->setName("vector.scevcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, SCEVCheck)); - LoopBypassBlocks.push_back(BB); + assert(!SCEVCheckBlock->getParent()->hasOptSize() && + "Cannot SCEV check stride or overflow when optimizing for size"); + + SCEVCheckBlock->setName("vector.scevcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, + nullptr, "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, SCEVCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock); + } + + ReplaceInstWithInst( + SCEVCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; } @@ -2637,7 +2767,9 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { if (EnableVPlanNativePath) return; - BasicBlock *BB = L->getLoopPreheader(); + // Reuse existing vector loop preheader for runtime memory checks. + // Note that new preheader block is generated for vector loop. 
+ BasicBlock *const MemCheckBlock = L->getLoopPreheader(); // Generate the code that checks in runtime if arrays overlap. We put the // checks into a separate block to make the more common case of few elements @@ -2645,29 +2777,46 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { Instruction *FirstCheckInst; Instruction *MemRuntimeCheck; std::tie(FirstCheckInst, MemRuntimeCheck) = - Legal->getLAI()->addRuntimeChecks(BB->getTerminator()); + Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator()); if (!MemRuntimeCheck) return; - assert(!Cost->foldTailByMasking() && "Cannot check memory when folding tail"); - // Create a new block containing the memory check. - BB->setName("vector.memcheck"); - auto *NewBB = BB->splitBasicBlock(BB->getTerminator(), "vector.ph"); - // Update dominator tree immediately if the generated block is a - // LoopBypassBlock because SCEV expansions to generate loop bypass - // checks may query it before the current function is finished. - DT->addNewBlock(NewBB, BB); - if (L->getParentLoop()) - L->getParentLoop()->addBasicBlockToLoop(NewBB, *LI); - ReplaceInstWithInst(BB->getTerminator(), - BranchInst::Create(Bypass, NewBB, MemRuntimeCheck)); - LoopBypassBlocks.push_back(BB); + if (MemCheckBlock->getParent()->hasOptSize()) { + assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled && + "Cannot emit memory checks when optimizing for size, unless forced " + "to vectorize."); + ORE->emit([&]() { + return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize", + L->getStartLoc(), L->getHeader()) + << "Code-size may be reduced by not forcing " + "vectorization, or by source-code modifications " + "eliminating the need for runtime checks " + "(e.g., adding 'restrict')."; + }); + } + + MemCheckBlock->setName("vector.memcheck"); + // Create new preheader for vector loop. + LoopVectorPreHeader = + SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, + "vector.ph"); + + // Update dominator only if this is first RT check. + if (LoopBypassBlocks.empty()) { + DT->changeImmediateDominator(Bypass, MemCheckBlock); + DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); + } + + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. - LVer = llvm::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, - PSE.getSE()); + LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT, + PSE.getSE()); LVer->prepareNoAliasMetadata(); } @@ -2792,12 +2941,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ... */ - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - BasicBlock *VectorPH = OrigLoop->getLoopPreheader(); - BasicBlock *ExitBlock = OrigLoop->getExitBlock(); MDNode *OrigLoopID = OrigLoop->getLoopID(); - assert(VectorPH && "Invalid loop structure"); - assert(ExitBlock && "Must have an exit block"); // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer @@ -2814,12 +2958,27 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { Type *IdxTy = Legal->getWidestInductionType(); // Split the single block loop into the two loop structure described above. 
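// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the kind of predicate that
// addRuntimeChecks() materializes in vector.memcheck. Two accessed byte
// ranges are proven disjoint so the vector loop may reorder their accesses;
// if any pair may overlap, control branches to the scalar loop instead.
// Plain C++; names are made up.
#include <cstddef>
#include <cstdint>

// Half-open ranges [AStart, AEnd) and [BStart, BEnd) do not overlap.
static bool rangesDisjoint(std::uintptr_t AStart, std::uintptr_t AEnd,
                           std::uintptr_t BStart, std::uintptr_t BEnd) {
  return AEnd <= BStart || BEnd <= AStart;
}

bool mayRunVectorLoop(const float *Dst, const float *Src, std::size_t N) {
  auto D = reinterpret_cast<std::uintptr_t>(Dst);
  auto S = reinterpret_cast<std::uintptr_t>(Src);
  return rangesDisjoint(D, D + N * sizeof(float), S, S + N * sizeof(float));
}
// ---------------------------------------------------------------------------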
-  BasicBlock *VecBody =
-      VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
-  BasicBlock *MiddleBlock =
-      VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
-  BasicBlock *ScalarPH =
-      MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  LoopScalarBody = OrigLoop->getHeader();
+  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+  LoopExitBlock = OrigLoop->getExitBlock();
+  assert(LoopExitBlock && "Must have an exit block");
+  assert(LoopVectorPreHeader && "Invalid loop structure");
+
+  LoopMiddleBlock =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 LI, nullptr, "middle.block");
+  LoopScalarPreHeader =
+      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+                 nullptr, "scalar.ph");
+  // We intentionally don't let SplitBlock update LoopInfo since
+  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place a few lines later.
+  LoopVectorBody =
+      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+                 nullptr, nullptr, "vector.body");
+
+  // Update dominator for loop exit.
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

   // Create and register the new vector loop.
   Loop *Lp = LI->AllocateLoop();
@@ -2829,12 +2988,10 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // before calling any utilities such as SCEV that require valid LoopInfo.
   if (ParentLoop) {
     ParentLoop->addChildLoop(Lp);
-    ParentLoop->addBasicBlockToLoop(ScalarPH, *LI);
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI);
   } else {
     LI->addTopLevelLoop(Lp);
   }
-  Lp->addBasicBlockToLoop(VecBody, *LI);
+  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);

   // Find the loop boundaries.
   Value *Count = getOrCreateTripCount(Lp);
@@ -2846,16 +3003,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // backedge-taken count is uint##_max: adding one to it will overflow leading
   // to an incorrect trip count of zero. In this (rare) case we will also jump
   // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, ScalarPH);
+  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

   // Generate the code to check any assumptions that we've made for SCEV
   // expressions.
-  emitSCEVChecks(Lp, ScalarPH);
+  emitSCEVChecks(Lp, LoopScalarPreHeader);

   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, ScalarPH);
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
@@ -2883,8 +3040,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     InductionDescriptor II = InductionEntry.second;

     // Create phi nodes to merge from the backedge-taken check block.
-    PHINode *BCResumeVal = PHINode::Create(
-        OrigPhi->getType(), 3, "bc.resume.val", ScalarPH->getTerminator());
+    PHINode *BCResumeVal =
+        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+                        LoopScalarPreHeader->getTerminator());
     // Copy original phi DL over to the new one.
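// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the block layout the three
// SplitBlock() calls above produce, before emitMinimumIterationCountCheck /
// emitSCEVChecks / emitMemRuntimeChecks split further check blocks out of
// vector.ph (each such check block gets a bypass edge to scalar.ph):
//
//   [ original preheader ]   <- LoopVectorPreHeader ("vector.ph")
//             |
//       [ vector.body ]      <- LoopVectorBody, the new vector loop
//             |
//      [ middle.block ]      <- LoopMiddleBlock, branches on cmp.n
//         /        \
//  [ scalar.ph ]  [ exit ]   <- LoopScalarPreHeader, LoopExitBlock
//        |
//  [ scalar loop ]           <- LoopScalarBody (the original loop)
// ---------------------------------------------------------------------------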
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); Value *&EndValue = IVEndValues[OrigPhi]; @@ -2895,23 +3053,23 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { IRBuilder<> B(Lp->getLoopPreheader()->getTerminator()); Type *StepType = II.getStep()->getType(); Instruction::CastOps CastOp = - CastInst::getCastOpcode(CountRoundDown, true, StepType, true); + CastInst::getCastOpcode(CountRoundDown, true, StepType, true); Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd"); - const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout(); + const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout(); EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II); EndValue->setName("ind.end"); } // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - BCResumeVal->addIncoming(EndValue, MiddleBlock); + BCResumeVal->addIncoming(EndValue, LoopMiddleBlock); // Fix the scalar body counter (PHI node). // The old induction's phi node in the scalar body needs the truncated // value. for (BasicBlock *BB : LoopBypassBlocks) BCResumeVal->addIncoming(II.getStartValue(), BB); - OrigPhi->setIncomingValueForBlock(ScalarPH, BCResumeVal); + OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } // We need the OrigLoop (scalar loop part) latch terminator to help @@ -2929,9 +3087,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { // If tail is to be folded, we know we don't need to run the remainder. Value *CmpN = Builder.getTrue(); if (!Cost->foldTailByMasking()) { - CmpN = - CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", MiddleBlock->getTerminator()); + CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, + CountRoundDown, "cmp.n", + LoopMiddleBlock->getTerminator()); // Here we use the same DebugLoc as the scalar loop latch branch instead // of the corresponding compare because they may have ended up with @@ -2940,20 +3098,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc()); } - BranchInst *BrInst = BranchInst::Create(ExitBlock, ScalarPH, CmpN); + BranchInst *BrInst = + BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN); BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc()); - ReplaceInstWithInst(MiddleBlock->getTerminator(), BrInst); + ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst); // Get ready to start creating new instructions into the vectorized body. - Builder.SetInsertPoint(&*VecBody->getFirstInsertionPt()); - - // Save the state. 
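// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): what the bc.resume.val phi
// built above computes. scalar.ph has one predecessor per bypass block plus
// middle.block; if any runtime check failed the scalar loop starts from the
// original start value, otherwise it resumes after the iterations the vector
// loop already executed (ind.end). Plain C++; names are made up.
#include <cstddef>

std::size_t bcResumeVal(bool TookBypass, std::size_t StartValue,
                        std::size_t IndEnd) {
  // One incoming value per predecessor of scalar.ph:
  //   StartValue from every bypass block, IndEnd from middle.block.
  return TookBypass ? StartValue : IndEnd;
}
// ---------------------------------------------------------------------------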
- LoopVectorPreHeader = Lp->getLoopPreheader(); - LoopScalarPreHeader = ScalarPH; - LoopMiddleBlock = MiddleBlock; - LoopExitBlock = ExitBlock; - LoopVectorBody = VecBody; - LoopScalarBody = OldBasicBlock; + assert(LoopVectorPreHeader == Lp->getLoopPreheader() && + "Inconsistent vector loop preheader"); + Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); Optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll, @@ -2974,6 +3127,11 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { LoopVectorizeHints Hints(Lp, true, *ORE); Hints.setAlreadyVectorized(); +#ifdef EXPENSIVE_CHECKS + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); + LI->verify(*DT); +#endif + return LoopVectorPreHeader; } @@ -3309,15 +3467,8 @@ void InnerLoopVectorizer::fixVectorizedLoop() { // This is the second stage of vectorizing recurrences. fixCrossIterationPHIs(); - // Update the dominator tree. - // - // FIXME: After creating the structure of the new loop, the dominator tree is - // no longer up-to-date, and it remains that way until we update it - // here. An out-of-date dominator tree is problematic for SCEV, - // because SCEVExpander uses it to guide code generation. The - // vectorizer use SCEVExpanders in several places. Instead, we should - // keep the dominator tree up-to-date as we go. - updateAnalysis(); + // Forget the original basic block. + PSE.getSE()->forgetLoop(OrigLoop); // Fix-up external users of the induction variables. for (auto &Entry : *Legal->getInductionVars()) @@ -3430,17 +3581,27 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // among all unrolled iterations, due to the order of their construction. Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1); - // Set the insertion point after the previous value if it is an instruction. + // Find and set the insertion point after the previous value if it is an + // instruction. + BasicBlock::iterator InsertPt; // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. Also, if the previous - // value is a phi node, we should insert after all the phi nodes to avoid - // breaking basic block verification. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart) || - isa<PHINode>(PreviousLastPart)) - Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); - else - Builder.SetInsertPoint( - &*++BasicBlock::iterator(cast<Instruction>(PreviousLastPart))); + // guaranteed to be an instruction in the vector loop. + // FIXME: Loop invariant values do not form recurrences. We should deal with + // them earlier. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart)) + InsertPt = LoopVectorBody->getFirstInsertionPt(); + else { + Instruction *PreviousInst = cast<Instruction>(PreviousLastPart); + if (isa<PHINode>(PreviousLastPart)) + // If the previous value is a phi node, we should insert after all the phi + // nodes in the block containing the PHI to avoid breaking basic block + // verification. Note that the basic block may be different to + // LoopVectorBody, in case we predicate the loop. + InsertPt = PreviousInst->getParent()->getFirstInsertionPt(); + else + InsertPt = ++PreviousInst->getIterator(); + } + Builder.SetInsertPoint(&*InsertPt); // We will construct a vector for the recurrence by combining the values for // the current and previous iterations. This is the required shuffle mask. 
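// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the shuffle that stitches a
// first-order recurrence together, as described above. Lane 0 of the combined
// vector takes the last lane of the previous iteration's vector; the other
// lanes take lanes 0..VF-2 of the current one, i.e. the shuffle mask is
// {VF-1, VF, ..., 2*VF-2} over the concatenation (Previous, Current).
// Plain C++ stand-in with made-up values.
#include <array>
#include <cstdio>

int main() {
  constexpr unsigned VF = 4;
  std::array<int, VF> Previous = {0, 1, 2, 3}; // parts from the last iteration
  std::array<int, VF> Current = {4, 5, 6, 7};  // parts from this iteration

  std::array<int, VF> Combined;
  for (unsigned I = 0; I < VF; ++I) {
    unsigned MaskElt = VF - 1 + I; // {3, 4, 5, 6} for VF == 4
    Combined[I] = MaskElt < VF ? Previous[MaskElt] : Current[MaskElt - VF];
  }
  for (int V : Combined)
    std::printf("%d ", V); // prints: 3 4 5 6
  std::printf("\n");
  return 0;
}
// ---------------------------------------------------------------------------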
@@ -3573,16 +3734,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } } + // Wrap flags are in general invalid after vectorization, clear them. + clearReductionWrapFlags(RdxDesc); + // Fix the vector-loop phi. // Reductions do not have to start at zero. They can start with // any loop invariant values. BasicBlock *Latch = OrigLoop->getLoopLatch(); Value *LoopVal = Phi->getIncomingValueForBlock(Latch); + for (unsigned Part = 0; Part < UF; ++Part) { Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part); Value *Val = getOrCreateVectorValue(LoopVal, Part); - // Make sure to add the reduction stat value only to the + // Make sure to add the reduction start value only to the // first unroll part. Value *StartVal = (Part == 0) ? VectorStart : Identity; cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader); @@ -3598,6 +3763,26 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { setDebugLocFromInst(Builder, LoopExitInst); + // If tail is folded by masking, the vector value to leave the loop should be + // a Select choosing between the vectorized LoopExitInst and vectorized Phi, + // instead of the former. + if (Cost->foldTailByMasking()) { + for (unsigned Part = 0; Part < UF; ++Part) { + Value *VecLoopExitInst = + VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + Value *Sel = nullptr; + for (User *U : VecLoopExitInst->users()) { + if (isa<SelectInst>(U)) { + assert(!Sel && "Reduction exit feeding two selects"); + Sel = U; + } else + assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select"); + } + assert(Sel && "Reduction exit feeds no select"); + VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel); + } + } + // If the vector reduction can be performed in a smaller type, we truncate // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. @@ -3699,6 +3884,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } +void InnerLoopVectorizer::clearReductionWrapFlags( + RecurrenceDescriptor &RdxDesc) { + RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind(); + if (RK != RecurrenceDescriptor::RK_IntegerAdd && + RK != RecurrenceDescriptor::RK_IntegerMult) + return; + + Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr(); + assert(LoopExitInstr && "null loop exit instruction"); + SmallVector<Instruction *, 8> Worklist; + SmallPtrSet<Instruction *, 8> Visited; + Worklist.push_back(LoopExitInstr); + Visited.insert(LoopExitInstr); + + while (!Worklist.empty()) { + Instruction *Cur = Worklist.pop_back_val(); + if (isa<OverflowingBinaryOperator>(Cur)) + for (unsigned Part = 0; Part < UF; ++Part) { + Value *V = getOrCreateVectorValue(Cur, Part); + cast<Instruction>(V)->dropPoisonGeneratingFlags(); + } + + for (User *U : Cur->users()) { + Instruction *UI = cast<Instruction>(U); + if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) && + Visited.insert(UI).second) + Worklist.push_back(UI); + } + } +} + void InnerLoopVectorizer::fixLCSSAPHIs() { for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { if (LCSSAPhi.getNumIncomingValues() == 1) { @@ -3820,6 +4036,75 @@ void InnerLoopVectorizer::fixNonInductionPHIs() { } } +void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF, + unsigned VF, bool IsPtrLoopInvariant, + SmallBitVector &IsIndexLoopInvariant) { + // Construct a vector GEP by widening the operands of the scalar GEP as + // necessary. We mark the vector GEP 'inbounds' if appropriate. 
A GEP + // results in a vector of pointers when at least one operand of the GEP + // is vector-typed. Thus, to keep the representation compact, we only use + // vector-typed operands for loop-varying values. + + if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) { + // If we are vectorizing, but the GEP has only loop-invariant operands, + // the GEP we build (by only using vector-typed operands for + // loop-varying values) would be a scalar pointer. Thus, to ensure we + // produce a vector of pointers, we need to either arbitrarily pick an + // operand to broadcast, or broadcast a clone of the original GEP. + // Here, we broadcast a clone of the original. + // + // TODO: If at some point we decide to scalarize instructions having + // loop-invariant operands, this special case will no longer be + // required. We would add the scalarization decision to + // collectLoopScalars() and teach getVectorValue() to broadcast + // the lane-zero scalar value. + auto *Clone = Builder.Insert(GEP->clone()); + for (unsigned Part = 0; Part < UF; ++Part) { + Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); + VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart); + addMetadata(EntryPart, GEP); + } + } else { + // If the GEP has at least one loop-varying operand, we are sure to + // produce a vector of pointers. But if we are only unrolling, we want + // to produce a scalar GEP for each unroll part. Thus, the GEP we + // produce with the code below will be scalar (if VF == 1) or vector + // (otherwise). Note that for the unroll-only case, we still maintain + // values in the vector mapping with initVector, as we do for other + // instructions. + for (unsigned Part = 0; Part < UF; ++Part) { + // The pointer operand of the new GEP. If it's loop-invariant, we + // won't broadcast it. + auto *Ptr = IsPtrLoopInvariant + ? GEP->getPointerOperand() + : getOrCreateVectorValue(GEP->getPointerOperand(), Part); + + // Collect all the indices for the new GEP. If any index is + // loop-invariant, we won't broadcast it. + SmallVector<Value *, 4> Indices; + for (auto Index : enumerate(GEP->indices())) { + Value *User = Index.value().get(); + if (IsIndexLoopInvariant[Index.index()]) + Indices.push_back(User); + else + Indices.push_back(getOrCreateVectorValue(User, Part)); + } + + // Create the new GEP. Note that this GEP may be a scalar if VF == 1, + // but it should be a vector, otherwise. + auto *NewGEP = + GEP->isInBounds() + ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, + Indices) + : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); + assert((VF == 1 || NewGEP->getType()->isVectorTy()) && + "NewGEP is not a pointer vector"); + VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP); + addMetadata(NewGEP, GEP); + } + } +} + void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF) { PHINode *P = cast<PHINode>(PN); @@ -3922,76 +4207,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { case Instruction::Br: case Instruction::PHI: + case Instruction::GetElementPtr: llvm_unreachable("This instruction is handled by a different recipe."); - case Instruction::GetElementPtr: { - // Construct a vector GEP by widening the operands of the scalar GEP as - // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP - // results in a vector of pointers when at least one operand of the GEP - // is vector-typed. 
Thus, to keep the representation compact, we only use - // vector-typed operands for loop-varying values. - auto *GEP = cast<GetElementPtrInst>(&I); - - if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) { - // If we are vectorizing, but the GEP has only loop-invariant operands, - // the GEP we build (by only using vector-typed operands for - // loop-varying values) would be a scalar pointer. Thus, to ensure we - // produce a vector of pointers, we need to either arbitrarily pick an - // operand to broadcast, or broadcast a clone of the original GEP. - // Here, we broadcast a clone of the original. - // - // TODO: If at some point we decide to scalarize instructions having - // loop-invariant operands, this special case will no longer be - // required. We would add the scalarization decision to - // collectLoopScalars() and teach getVectorValue() to broadcast - // the lane-zero scalar value. - auto *Clone = Builder.Insert(GEP->clone()); - for (unsigned Part = 0; Part < UF; ++Part) { - Value *EntryPart = Builder.CreateVectorSplat(VF, Clone); - VectorLoopValueMap.setVectorValue(&I, Part, EntryPart); - addMetadata(EntryPart, GEP); - } - } else { - // If the GEP has at least one loop-varying operand, we are sure to - // produce a vector of pointers. But if we are only unrolling, we want - // to produce a scalar GEP for each unroll part. Thus, the GEP we - // produce with the code below will be scalar (if VF == 1) or vector - // (otherwise). Note that for the unroll-only case, we still maintain - // values in the vector mapping with initVector, as we do for other - // instructions. - for (unsigned Part = 0; Part < UF; ++Part) { - // The pointer operand of the new GEP. If it's loop-invariant, we - // won't broadcast it. - auto *Ptr = - OrigLoop->isLoopInvariant(GEP->getPointerOperand()) - ? GEP->getPointerOperand() - : getOrCreateVectorValue(GEP->getPointerOperand(), Part); - - // Collect all the indices for the new GEP. If any index is - // loop-invariant, we won't broadcast it. - SmallVector<Value *, 4> Indices; - for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) { - if (OrigLoop->isLoopInvariant(U.get())) - Indices.push_back(U.get()); - else - Indices.push_back(getOrCreateVectorValue(U.get(), Part)); - } - - // Create the new GEP. Note that this GEP may be a scalar if VF == 1, - // but it should be a vector, otherwise. - auto *NewGEP = - GEP->isInBounds() - ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr, - Indices) - : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices); - assert((VF == 1 || NewGEP->getType()->isVectorTy()) && - "NewGEP is not a pointer vector"); - VectorLoopValueMap.setVectorValue(&I, Part, NewGEP); - addMetadata(NewGEP, GEP); - } - } - - break; - } case Instruction::UDiv: case Instruction::SDiv: case Instruction::SRem: @@ -4064,7 +4281,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::FCmp: { // Widen compares. Generate vector compares. bool FCmp = (I.getOpcode() == Instruction::FCmp); - auto *Cmp = dyn_cast<CmpInst>(&I); + auto *Cmp = cast<CmpInst>(&I); setDebugLocFromInst(Builder, Cmp); for (unsigned Part = 0; Part < UF; ++Part) { Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part); @@ -4097,7 +4314,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - auto *CI = dyn_cast<CastInst>(&I); + auto *CI = cast<CastInst>(&I); setDebugLocFromInst(Builder, CI); /// Vectorize casts. 
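// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the two cases the new
// widenGEP() above distinguishes, modeled lane-by-lane in plain C++. With a
// loop-varying index every lane gets its own address (a true vector of
// pointers); with all-invariant operands the scalar address is computed once
// and broadcast so the result is still vector-typed. Names are made up.
#include <array>
#include <cstddef>

constexpr unsigned VF = 4;

// Loop-varying index: one address per lane.
std::array<const float *, VF>
widenVaryingGEP(const float *Base, std::array<std::size_t, VF> Idx) {
  std::array<const float *, VF> Lanes;
  for (unsigned I = 0; I < VF; ++I)
    Lanes[I] = Base + Idx[I];
  return Lanes;
}

// All operands loop-invariant: compute once, splat the clone.
std::array<const float *, VF> widenInvariantGEP(const float *Base,
                                                std::size_t Idx) {
  std::array<const float *, VF> Lanes;
  Lanes.fill(Base + Idx); // CreateVectorSplat(VF, Clone)
  return Lanes;
}
// ---------------------------------------------------------------------------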
@@ -4195,26 +4412,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } // end of switch. } -void InnerLoopVectorizer::updateAnalysis() { - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - - // DT is not kept up-to-date for outer loop vectorization - if (EnableVPlanNativePath) - return; - - // Update the dominator tree information. - assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) && - "Entry does not dominate exit."); - - DT->addNewBlock(LoopMiddleBlock, - LI->getLoopFor(LoopVectorBody)->getLoopLatch()); - DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]); - DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); - DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); - assert(DT->verify(DominatorTree::VerificationLevel::Fast)); -} - void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does @@ -4421,9 +4618,11 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne "Widening decision should be ready at this moment"); return WideningDecision == CM_Scalarize; } - return isa<LoadInst>(I) ? - !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty)) - : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty)); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + isLegalMaskedGather(Ty, Alignment)) + : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + isLegalMaskedScatter(Ty, Alignment)); } case Instruction::UDiv: case Instruction::SDiv: @@ -4452,10 +4651,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, // Check if masking is required. // A Group may need masking for one of two reasons: it resides in a block that // needs predication, or it was decided to use masking to deal with gaps. - bool PredicatedAccessRequiresMasking = + bool PredicatedAccessRequiresMasking = Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I); - bool AccessWithGapsRequiresMasking = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool AccessWithGapsRequiresMasking = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking) return true; @@ -4466,8 +4665,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I, "Masked interleave-groups for predicated accesses are not enabled."); auto *Ty = getMemInstValueType(I); - return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty) - : TTI.isLegalMaskedStore(Ty); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) + : TTI.isLegalMaskedStore(Ty, Alignment); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I, @@ -4525,14 +4725,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector<Instruction *> Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. 
+ auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4591,10 +4803,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4620,10 +4830,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4665,92 +4873,103 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize. - return computeFeasibleMaxVF(OptForSize, TC); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { - ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") - << "runtime pointer checks needed. 
Enable vectorization of this "
-                 "loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
-    return None;
+    reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+        "runtime pointer checks needed. Enable vectorization of this "
+        "loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
   }

   if (!PSE.getUnionPredicate().getPredicates().empty()) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime SCEV checks needed. Enable vectorization of this "
-                 "loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime SCEV check is required with -Os/-Oz.\n");
-    return None;
+    reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+        "runtime SCEV checks needed. Enable vectorization of this "
+        "loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
   }

   // FIXME: Avoid specializing for stride==1 instead of bailing out.
   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
-              << "runtime stride == 1 checks needed. Enable vectorization of "
-                 "this loop with '#pragma clang loop vectorize(enable)' when "
-                 "compiling with -Os/-Oz");
-    LLVM_DEBUG(
-        dbgs()
-        << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n");
+    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+        "runtime stride == 1 checks needed. Enable vectorization of "
+        "this loop with '#pragma clang loop vectorize(enable)' when "
+        "compiling with -Os/-Oz",
+        "CantVersionLoopWithOptForSize", ORE, TheLoop);
+    return true;
+  }
+
+  return false;
+}
+
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to do since it's still likely to be dynamically
+    // uniform if the target can skip.
+    reportVectorizationFailure(
+        "Not inserting runtime ptr check for divergent target",
+        "runtime pointer checks needed. Not enabled for divergent target",
+        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
     return None;
   }

-  // If we optimize the program for size, avoid creating the tail loop.
+  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
   if (TC == 1) {
-    ORE->emit(createMissedAnalysis("SingleIterationLoop")
-              << "loop trip count is one, irrelevant for vectorization");
-    LLVM_DEBUG(dbgs() << "LV: Aborting, single iteration (non) loop.\n");
+    reportVectorizationFailure("Single iteration (non) loop",
+        "loop trip count is one, irrelevant for vectorization",
+        "SingleIterationLoop", ORE, TheLoop);
     return None;
   }

-  // Record that scalar epilogue is not allowed.
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (ScalarEpilogueStatus) { + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotNeededUsePredicate: + LLVM_DEBUG( + dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating predicated " + << "vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + // fallthrough as a special case of OptForSize + case CM_ScalarEpilogueNotAllowedOptSize: + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize) + LLVM_DEBUG( + dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + else + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); - IsScalarEpilogueAllowed = !OptForSize; + // Bail if runtime checks are required, which are not good when optimising + // for size. + if (runtimeChecksRequired()) + return None; + break; + } + + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. - if (!useMaskedInterleavedAccesses(TTI)) + if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); - unsigned MaxVF = computeFeasibleMaxVF(OptForSize, TC); - + unsigned MaxVF = computeFeasibleMaxVF(TC); if (TC > 0 && TC % MaxVF == 0) { + // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; } @@ -4759,28 +4978,30 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->canFoldTailByMasking()) { + if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; return MaxVF; } if (TC == 0) { - ORE->emit( - createMissedAnalysis("UnknownLoopCountComplexCFG") - << "unable to calculate the loop count due to complex control flow"); + reportVectorizationFailure( + "Unable to calculate the loop count due to complex control flow", + "unable to calculate the loop count due to complex control flow", + "UnknownLoopCountComplexCFG", ORE, TheLoop); return None; } - ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize") - << "cannot optimize for size and vectorize at the same time. " - "Enable vectorization of this loop with '#pragma clang loop " - "vectorize(enable)' when compiling with -Os/-Oz"); + reportVectorizationFailure( + "Cannot optimize for size and vectorize at the same time.", + "cannot optimize for size and vectorize at the same time. 
" + "Enable vectorization of this loop with '#pragma clang loop " + "vectorize(enable)' when compiling with -Os/-Oz", + "NoTailLoopWithOptForSize", ORE, TheLoop); return None; } unsigned -LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, - unsigned ConstTripCount) { +LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) { MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI); unsigned SmallestType, WidestType; std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes(); @@ -4818,8 +5039,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, } unsigned MaxVF = MaxVectorSize; - if (TTI.shouldMaximizeVectorBandwidth(OptForSize) || - (MaximizeBandwidth && !OptForSize)) { + if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) || + (MaximizeBandwidth && isScalarEpilogueAllowed())) { // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorSize). SmallVector<unsigned, 8> VFs; @@ -4832,9 +5053,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize, // Select the largest VF which doesn't require more registers than existing // ones. - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true); for (int i = RUs.size() - 1; i >= 0; --i) { - if (RUs[i].MaxLocalUsers <= TargetNumRegisters) { + bool Selected = true; + for (auto& pair : RUs[i].MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + if (pair.second > TargetNumRegisters) + Selected = false; + } + if (Selected) { MaxVF = VFs[i]; break; } @@ -4886,10 +5112,9 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) { } if (!EnableCondStoresVectorization && NumPredStores) { - ORE->emit(createMissedAnalysis("ConditionalStore") - << "store that is conditionally executed prevents vectorization"); - LLVM_DEBUG( - dbgs() << "LV: No vectorization. There are conditional stores.\n"); + reportVectorizationFailure("There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, TheLoop); Width = 1; Cost = ScalarCost; } @@ -4958,8 +5183,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() { return {MinWidth, MaxWidth}; } -unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, - unsigned VF, +unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF, unsigned LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -4975,35 +5199,25 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - // When we optimize for size, we don't interleave. - if (OptForSize) + if (!isScalarEpilogueAllowed()) return 1; // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; - // Do not interleave loops with a relatively small trip count. - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (TC > 1 && TC < TinyTripCountInterleaveThreshold) + // Do not interleave loops with a relatively small known or estimated trip + // count. 
+ auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold) return 1; - unsigned TargetNumRegisters = TTI.getNumberOfRegisters(VF > 1); - LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters - << " registers\n"); - - if (VF == 1) { - if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumScalarRegs; - } else { - if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) - TargetNumRegisters = ForceTargetNumVectorRegs; - } - RegisterUsage R = calculateRegisterUsage({VF})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. - R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U); + for (auto& pair : R.MaxLocalUsers) { + pair.second = std::max(pair.second, 1U); + } // We calculate the interleave count using the following formula. // Subtract the number of loop invariants from the number of available @@ -5016,13 +5230,35 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, // We also want power of two interleave counts to ensure that the induction // variable of the vector loop wraps to zero, when tail is folded by masking; // this currently happens when OptForSize, in which case IC is set to 1 above. - unsigned IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs) / - R.MaxLocalUsers); + unsigned IC = UINT_MAX; - // Don't count the induction variable as interleaved. - if (EnableIndVarRegisterHeur) - IC = PowerOf2Floor((TargetNumRegisters - R.LoopInvariantRegs - 1) / - std::max(1U, (R.MaxLocalUsers - 1))); + for (auto& pair : R.MaxLocalUsers) { + unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first); + LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters + << " registers of " + << TTI.getRegisterClassName(pair.first) << " register class\n"); + if (VF == 1) { + if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumScalarRegs; + } else { + if (ForceTargetNumVectorRegs.getNumOccurrences() > 0) + TargetNumRegisters = ForceTargetNumVectorRegs; + } + unsigned MaxLocalUsers = pair.second; + unsigned LoopInvariantRegs = 0; + if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end()) + LoopInvariantRegs = R.LoopInvariantRegs[pair.first]; + + unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers); + // Don't count the induction variable as interleaved. + if (EnableIndVarRegisterHeur) { + TmpIC = + PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) / + std::max(1U, (MaxLocalUsers - 1))); + } + + IC = std::min(IC, TmpIC); + } // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); @@ -5036,6 +5272,12 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } + // If trip count is known or estimated compile time constant, limit the + // interleave count to be less than the trip count divided by VF. + if (BestKnownTC) { + MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount); + } + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. 
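// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the interleave-count formula
// above as plain arithmetic. Per register class, the registers left over
// after loop-invariant values are divided among the live values of one copy
// of the loop body and rounded down to a power of two; the final IC is the
// minimum over all classes, later clamped by the target's maximum interleave
// factor and by trip count / VF. The example numbers are made up.
#include <algorithm>
#include <climits>
#include <cstdio>

static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P * 2 <= X)
    P *= 2;
  return X ? P : 0;
}

int main() {
  // Per class: {available registers, loop-invariant values, max live values}.
  struct ClassRU {
    unsigned TargetNumRegisters, LoopInvariantRegs, MaxLocalUsers;
  };
  const ClassRU Classes[] = {{32, 2, 5}, {16, 1, 3}}; // e.g. GPR and vector

  unsigned IC = UINT_MAX;
  for (const ClassRU &C : Classes)
    IC = std::min(IC, powerOf2Floor(
                          (C.TargetNumRegisters - C.LoopInvariantRegs) /
                          C.MaxLocalUsers));
  std::printf("IC = %u\n", IC); // (32-2)/5=6 -> 4; (16-1)/3=5 -> 4; IC = 4
  return 0;
}
// ---------------------------------------------------------------------------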
if (LoopCost == 0) @@ -5044,7 +5286,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize, assert(LoopCost && "Non-zero loop cost expected"); // Clamp the calculated IC to be between the 1 and the max interleave count - // that the target allows. + // that the target and trip count allows. if (IC > MaxInterleaveCount) IC = MaxInterleaveCount; else if (IC < 1) @@ -5196,7 +5438,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { const DataLayout &DL = TheFunction->getParent()->getDataLayout(); SmallVector<RegisterUsage, 8> RUs(VFs.size()); - SmallVector<unsigned, 8> MaxUsages(VFs.size(), 0); + SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size()); LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); @@ -5226,21 +5468,45 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { // For each VF find the maximum usage of registers. for (unsigned j = 0, e = VFs.size(); j < e; ++j) { + // Count the number of live intervals. + SmallMapVector<unsigned, unsigned, 4> RegUsage; + if (VFs[j] == 1) { - MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size()); - continue; + for (auto Inst : OpenIntervals) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } + } else { + collectUniformsAndScalars(VFs[j]); + for (auto Inst : OpenIntervals) { + // Skip ignored values for VF > 1. + if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end()) + continue; + if (isScalarAfterVectorization(Inst, VFs[j])) { + unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = 1; + else + RegUsage[ClassID] += 1; + } else { + unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); + if (RegUsage.find(ClassID) == RegUsage.end()) + RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]); + else + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + } + } } - collectUniformsAndScalars(VFs[j]); - // Count the number of live intervals. - unsigned RegUsage = 0; - for (auto Inst : OpenIntervals) { - // Skip ignored values for VF > 1. - if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end() || - isScalarAfterVectorization(Inst, VFs[j])) - continue; - RegUsage += GetRegUsage(Inst->getType(), VFs[j]); + + for (auto& pair : RegUsage) { + if (MaxUsages[j].find(pair.first) != MaxUsages[j].end()) + MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second); + else + MaxUsages[j][pair.first] = pair.second; } - MaxUsages[j] = std::max(MaxUsages[j], RegUsage); } LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " @@ -5251,18 +5517,34 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) { } for (unsigned i = 0, e = VFs.size(); i < e; ++i) { - unsigned Invariant = 0; - if (VFs[i] == 1) - Invariant = LoopInvariants.size(); - else { - for (auto Inst : LoopInvariants) - Invariant += GetRegUsage(Inst->getType(), VFs[i]); + SmallMapVector<unsigned, unsigned, 4> Invariant; + + for (auto Inst : LoopInvariants) { + unsigned Usage = VFs[i] == 1 ? 
1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType()); + if (Invariant.find(ClassID) == Invariant.end()) + Invariant[ClassID] = Usage; + else + Invariant[ClassID] += Usage; } - LLVM_DEBUG(dbgs() << "LV(REG): VF = " << VFs[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsages[i] << '\n'); - LLVM_DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant - << '\n'); + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() + << " item\n"; + for (const auto &pair : MaxUsages[i]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); RU.LoopInvariantRegs = Invariant; RU.MaxLocalUsers = MaxUsages[i]; @@ -5511,7 +5793,6 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, Type *ValTy = getMemInstValueType(I); auto SE = PSE.getSE(); - unsigned Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); Value *Ptr = getLoadStorePointerOperand(I); Type *PtrTy = ToVectorTy(Ptr->getType(), VF); @@ -5525,9 +5806,9 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, // Don't pass *I here, since it is scalar but will actually be part of a // vectorized loop where the user of it is a vectorized instruction. - Cost += VF * - TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment, - AS); + const MaybeAlign Alignment = getLoadStoreAlignment(I); + Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), + Alignment, AS); // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. @@ -5552,16 +5833,17 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); unsigned AS = getLoadStoreAddressSpace(I); int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Stride should be 1 or -1 for consecutive memory access"); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned Cost = 0; if (Legal->isMaskRequired(I)) - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, + Alignment ? 
Alignment->value() : 0, AS); else Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I); @@ -5575,7 +5857,7 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); if (isa<LoadInst>(I)) { return TTI.getAddressComputationCost(ValTy) + @@ -5587,21 +5869,23 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I, bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand()); return TTI.getAddressComputationCost(ValTy) + TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) + - (isLoopInvariantStoreValue ? 0 : TTI.getVectorInstrCost( - Instruction::ExtractElement, - VectorTy, VF - 1)); + (isLoopInvariantStoreValue + ? 0 + : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy, + VF - 1)); } unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, unsigned VF) { Type *ValTy = getMemInstValueType(I); Type *VectorTy = ToVectorTy(ValTy, VF); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); Value *Ptr = getLoadStorePointerOperand(I); return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr, - Legal->isMaskRequired(I), Alignment); + Legal->isMaskRequired(I), + Alignment ? Alignment->value() : 0); } unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, @@ -5626,8 +5910,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, } // Calculate the cost of the whole interleaved group. - bool UseMaskForGaps = - Group->requiresScalarEpilogue() && !IsScalarEpilogueAllowed; + bool UseMaskForGaps = + Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); @@ -5648,7 +5932,7 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, // moment. if (VF == 1) { Type *ValTy = getMemInstValueType(I); - unsigned Alignment = getLoadStoreAlignment(I); + const MaybeAlign Alignment = getLoadStoreAlignment(I); unsigned AS = getLoadStoreAddressSpace(I); return TTI.getAddressComputationCost(ValTy) + @@ -5991,7 +6275,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; return N * TTI.getArithmeticInstrCost( I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, - Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands); + Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I); } case Instruction::FNeg: { unsigned N = isScalarAfterVectorization(I, VF) ? 
VF : 1; @@ -5999,7 +6283,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None, - I->getOperand(0)); + I->getOperand(0), I); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); @@ -6167,8 +6451,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits, } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, - unsigned UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) { unsigned VF = UserVF; // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. @@ -6207,10 +6490,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize, return VectorizationFactor::Disabled(); } -Optional<VectorizationFactor> LoopVectorizationPlanner::plan(bool OptForSize, - unsigned UserVF) { +Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) { assert(OrigLoop->empty() && "Inner loop expected."); - Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize); + Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(); if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved. return None; @@ -6434,7 +6716,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst, BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - if (!BI->isConditional()) + if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; VPValue *EdgeMask = Plan->getVPValue(BI->getCondition()); @@ -6490,37 +6772,6 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { return BlockMaskCache[BB] = BlockMask; } -VPInterleaveRecipe *VPRecipeBuilder::tryToInterleaveMemory(Instruction *I, - VFRange &Range, - VPlanPtr &Plan) { - const InterleaveGroup<Instruction> *IG = CM.getInterleavedAccessGroup(I); - if (!IG) - return nullptr; - - // Now check if IG is relevant for VF's in the given range. - auto isIGMember = [&](Instruction *I) -> std::function<bool(unsigned)> { - return [=](unsigned VF) -> bool { - return (VF >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(I, VF) == - LoopVectorizationCostModel::CM_Interleave); - }; - }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(isIGMember(I), Range)) - return nullptr; - - // I is a member of an InterleaveGroup for VF's in the (possibly trimmed) - // range. If it's the primary member of the IG construct a VPInterleaveRecipe. - // Otherwise, it's an adjunct member of the IG, do not construct any Recipe. 
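The removed tryToInterleaveMemory above, like the willWiden and applyIG lambdas elsewhere in this change, hands the planner a per-VF predicate and lets it clamp the VF range so one decision covers a whole sub-range. A minimal standalone sketch of that idea, assuming power-of-two candidate VFs and using only the standard library (the names here are illustrative stand-ins, not the planner's own API):

    #include <functional>

    // A half-open range of candidate vectorization factors [Start, End).
    struct VFRange {
      unsigned Start;
      unsigned End;
    };

    // Evaluate Predicate at Range.Start, then shrink Range.End so that every
    // VF still inside the range gives the same answer; the answer for
    // Range.Start is returned.
    static bool decideAndClampRange(const std::function<bool(unsigned)> &Predicate,
                                    VFRange &Range) {
      bool DecisionAtStart = Predicate(Range.Start);
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2) {
        if (Predicate(VF) != DecisionAtStart) {
          Range.End = VF; // the first VF that disagrees closes this sub-range
          break;
        }
      }
      return DecisionAtStart;
    }

The applyIG lambda later in this diff plays the role of Predicate: an interleave group is kept only for the sub-range of VFs in which the cost model chose CM_Interleave.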
- assert(I == IG->getInsertPos() && - "Generating a recipe for an adjunct member of an interleave group"); - - VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) - Mask = createBlockInMask(I->getParent(), Plan); - - return new VPInterleaveRecipe(IG, Mask); -} - VPWidenMemoryInstructionRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan) { @@ -6530,15 +6781,15 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, auto willWiden = [&](unsigned VF) -> bool { if (VF == 1) return false; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) - return false; LoopVectorizationCostModel::InstWidening Decision = CM.getWideningDecision(I, VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); - assert(Decision != LoopVectorizationCostModel::CM_Interleave && - "Interleave memory opportunity should be caught earlier."); + if (Decision == LoopVectorizationCostModel::CM_Interleave) + return true; + if (CM.isScalarAfterVectorization(I, VF) || + CM.isProfitableToScalarize(I, VF)) + return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -6549,7 +6800,8 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, if (Legal->isMaskRequired(I)) Mask = createBlockInMask(I->getParent(), Plan); - return new VPWidenMemoryInstructionRecipe(*I, Mask); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask); } VPWidenIntOrFpInductionRecipe * @@ -6637,7 +6889,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, case Instruction::FPTrunc: case Instruction::FRem: case Instruction::FSub: - case Instruction::GetElementPtr: case Instruction::ICmp: case Instruction::IntToPtr: case Instruction::Load: @@ -6702,16 +6953,23 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) return false; + // If this ingredient's recipe is to be recorded, keep its recipe a singleton + // to avoid having to split recipes later. + bool IsSingleton = Ingredient2Recipe.count(I); + + // Success: widen this instruction. - // Success: widen this instruction. We optimize the common case where + // Use the default widening recipe. We optimize the common case where // consecutive instructions can be represented by a single recipe. - if (!VPBB->empty()) { - VPWidenRecipe *LastWidenRecipe = dyn_cast<VPWidenRecipe>(&VPBB->back()); - if (LastWidenRecipe && LastWidenRecipe->appendInstruction(I)) - return true; - } + if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() && + LastExtensibleRecipe->appendInstruction(I)) + return true; - VPBB->appendRecipe(new VPWidenRecipe(I)); + VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I); + if (!IsSingleton) + LastExtensibleRecipe = WidenRecipe; + setRecipe(I, WidenRecipe); + VPBB->appendRecipe(WidenRecipe); return true; } @@ -6727,6 +6985,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); + setRecipe(I, Recipe); // Find if I uses a predicated instruction. If so, it will use its scalar // value. 
Avoid hoisting the insert-element which packs the scalar value into @@ -6785,36 +7044,36 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr, bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // Check if Instr should belong to an interleave memory recipe, or already - // does. In the latter case Instr is irrelevant. - if ((Recipe = tryToInterleaveMemory(Instr, Range, Plan))) { - VPBB->appendRecipe(Recipe); - return true; - } - // Check if Instr is a memory operation that should be widened. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan))) { + // First, check for specific widening recipes that deal with memory + // operations, inductions and Phi nodes. + if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + (Recipe = tryToOptimizeInduction(Instr, Range)) || + (Recipe = tryToBlend(Instr, Plan)) || + (isa<PHINode>(Instr) && + (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) { + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - // Check if Instr should form some PHI recipe. - if ((Recipe = tryToOptimizeInduction(Instr, Range))) { - VPBB->appendRecipe(Recipe); - return true; - } - if ((Recipe = tryToBlend(Instr, Plan))) { + // Handle GEP widening. + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { + auto Scalarize = [&](unsigned VF) { + return CM.isScalarWithPredication(Instr, VF) || + CM.isScalarAfterVectorization(Instr, VF) || + CM.isProfitableToScalarize(Instr, VF); + }; + if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range)) + return false; + VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop); + setRecipe(Instr, Recipe); VPBB->appendRecipe(Recipe); return true; } - if (PHINode *Phi = dyn_cast<PHINode>(Instr)) { - VPBB->appendRecipe(new VPWidenPHIRecipe(Phi)); - return true; - } // Check if Instr is to be widened by a general VPWidenRecipe, after - // having first checked for specific widening recipes that deal with - // Interleave Groups, Inductions and Phi nodes. + // having first checked for specific widening recipes. if (tryToWiden(Instr, VPBB, Range)) return true; @@ -6840,8 +7099,15 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, // If the tail is to be folded by masking, the primary induction variable // needs to be represented in VPlan for it to model early-exit masking. - if (CM.foldTailByMasking()) + // Also, both the Phi and the live-out instruction of each reduction are + // required in order to introduce a select between them in VPlan. + if (CM.foldTailByMasking()) { NeedDef.insert(Legal->getPrimaryInduction()); + for (auto &Reduction : *Legal->getReductionVars()) { + NeedDef.insert(Reduction.first); + NeedDef.insert(Reduction.second.getLoopExitInstr()); + } + } // Collect instructions from the original loop that will become trivially dead // in the vectorized loop. We don't need to vectorize these instructions. For @@ -6852,30 +7118,72 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF, SmallPtrSet<Instruction *, 4> DeadInstructions; collectTriviallyDeadInstructions(DeadInstructions); + DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); + // Dead instructions do not need sinking. Remove them from SinkAfter. 
+ for (Instruction *I : DeadInstructions) + SinkAfter.erase(I); + for (unsigned VF = MinVF; VF < MaxVF + 1;) { VFRange SubRange = {VF, MaxVF + 1}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions)); + VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef, - SmallPtrSetImpl<Instruction *> &DeadInstructions) { + SmallPtrSetImpl<Instruction *> &DeadInstructions, + const DenseMap<Instruction *, Instruction *> &SinkAfter) { + // Hold a mapping from predicated instructions to their recipes, in order to // fix their AlsoPack behavior if a user is determined to replicate and use a // scalar instead of vector value. DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe; - DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter(); - DenseMap<Instruction *, Instruction *> SinkAfterInverse; + SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; + + VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); + + // --------------------------------------------------------------------------- + // Pre-construction: record ingredients whose recipes we'll need to further + // process after constructing the initial VPlan. + // --------------------------------------------------------------------------- + + // Mark instructions we'll need to sink later and their targets as + // ingredients whose recipe we'll need to record. + for (auto &Entry : SinkAfter) { + RecipeBuilder.recordRecipeOf(Entry.first); + RecipeBuilder.recordRecipeOf(Entry.second); + } + + // For each interleave group which is relevant for this (possibly trimmed) + // Range, add it to the set of groups to be later applied to the VPlan and add + // placeholders for its members' Recipes which we'll be replacing with a + // single VPInterleaveRecipe. + for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) { + auto applyIG = [IG, this](unsigned VF) -> bool { + return (VF >= 2 && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), VF) == + LoopVectorizationCostModel::CM_Interleave); + }; + if (!getDecisionAndClampRange(applyIG, Range)) + continue; + InterleaveGroups.insert(IG); + for (unsigned i = 0; i < IG->getFactor(); i++) + if (Instruction *Member = IG->getMember(i)) + RecipeBuilder.recordRecipeOf(Member); + }; + + // --------------------------------------------------------------------------- + // Build initial VPlan: Scan the body of the loop in a topological order to + // visit each basic block after having visited its predecessor basic blocks. + // --------------------------------------------------------------------------- // Create a dummy pre-entry VPBasicBlock to start building the VPlan. VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry"); - auto Plan = llvm::make_unique<VPlan>(VPBB); + auto Plan = std::make_unique<VPlan>(VPBB); - VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder); // Represent values that will have defs inside VPlan. for (Value *V : NeedDef) Plan->addVPValue(V); @@ -6894,10 +7202,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBB = FirstVPBBForBB; Builder.setInsertPoint(VPBB); - std::vector<Instruction *> Ingredients; - - // Organize the ingredients to vectorize from current basic block in the - // right order. + // Introduce each ingredient into VPlan. 
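The pre-construction pass above only records which instructions will need their recipe replaced; the replacement itself happens after the initial plan is built, when each relevant interleave group collapses its members' memory recipes into a single recipe at the group's insert position. A plain-container sketch of that collapse step, with ints standing in for instructions (not VPlan code):

    #include <map>
    #include <memory>
    #include <vector>

    struct MemRecipe {
      int Instr;                // stand-in for the member instruction
      std::vector<int> Members; // non-empty only for the merged group recipe
    };

    using RecipeByInstr = std::map<int, std::shared_ptr<MemRecipe>>;

    // Replace the members' individual recipes with one grouped recipe, keyed
    // at the insert-position instruction; the other members simply disappear.
    void formInterleaveRecipe(RecipeByInstr &Recipes,
                              const std::vector<int> &Members, int InsertPos) {
      auto Grouped = std::make_shared<MemRecipe>();
      Grouped->Instr = InsertPos;
      Grouped->Members = Members;
      for (int M : Members)
        Recipes.erase(M);          // drop each member's placeholder recipe
      Recipes[InsertPos] = Grouped; // the group recipe sits at the insert position
    }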
for (Instruction &I : BB->instructionsWithoutDebug()) { Instruction *Instr = &I; @@ -6907,43 +7212,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( DeadInstructions.find(Instr) != DeadInstructions.end()) continue; - // I is a member of an InterleaveGroup for Range.Start. If it's an adjunct - // member of the IG, do not construct any Recipe for it. - const InterleaveGroup<Instruction> *IG = - CM.getInterleavedAccessGroup(Instr); - if (IG && Instr != IG->getInsertPos() && - Range.Start >= 2 && // Query is illegal for VF == 1 - CM.getWideningDecision(Instr, Range.Start) == - LoopVectorizationCostModel::CM_Interleave) { - auto SinkCandidate = SinkAfterInverse.find(Instr); - if (SinkCandidate != SinkAfterInverse.end()) - Ingredients.push_back(SinkCandidate->second); - continue; - } - - // Move instructions to handle first-order recurrences, step 1: avoid - // handling this instruction until after we've handled the instruction it - // should follow. - auto SAIt = SinkAfter.find(Instr); - if (SAIt != SinkAfter.end()) { - LLVM_DEBUG(dbgs() << "Sinking" << *SAIt->first << " after" - << *SAIt->second - << " to vectorize a 1st order recurrence.\n"); - SinkAfterInverse[SAIt->second] = Instr; - continue; - } - - Ingredients.push_back(Instr); - - // Move instructions to handle first-order recurrences, step 2: push the - // instruction to be sunk at its insertion point. - auto SAInvIt = SinkAfterInverse.find(Instr); - if (SAInvIt != SinkAfterInverse.end()) - Ingredients.push_back(SAInvIt->second); - } - - // Introduce each ingredient into VPlan. - for (Instruction *Instr : Ingredients) { if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB)) continue; @@ -6968,6 +7236,45 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( VPBlockUtils::disconnectBlocks(PreEntry, Entry); delete PreEntry; + // --------------------------------------------------------------------------- + // Transform initial VPlan: Apply previously taken decisions, in order, to + // bring the VPlan to its final state. + // --------------------------------------------------------------------------- + + // Apply Sink-After legal constraints. + for (auto &Entry : SinkAfter) { + VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first); + VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second); + Sink->moveAfter(Target); + } + + // Interleave memory: for each Interleave Group we marked earlier as relevant + // for this VPlan, replace the Recipes widening its memory instructions with a + // single VPInterleaveRecipe at its insertion point. + for (auto IG : InterleaveGroups) { + auto *Recipe = cast<VPWidenMemoryInstructionRecipe>( + RecipeBuilder.getRecipe(IG->getInsertPos())); + (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask())) + ->insertBefore(Recipe); + + for (unsigned i = 0; i < IG->getFactor(); ++i) + if (Instruction *Member = IG->getMember(i)) { + RecipeBuilder.getRecipe(Member)->eraseFromParent(); + } + } + + // Finally, if tail is folded by masking, introduce selects between the phi + // and the live-out instruction of each reduction, at the end of the latch. 
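A scalar model of why this select is needed when the tail is folded by masking: in the final, partially masked iteration the inactive lanes must keep the reduction phi's old value, otherwise their leftover contents would be folded into the final horizontal reduction. An add reduction is assumed here purely for concreteness:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    template <std::size_t W>
    std::array<int64_t, W> maskedReductionStep(const std::array<int64_t, W> &Phi,
                                               const std::array<int64_t, W> &Val,
                                               const std::array<bool, W> &Mask) {
      std::array<int64_t, W> Next{};
      for (std::size_t Lane = 0; Lane < W; ++Lane) {
        int64_t Updated = Phi[Lane] + Val[Lane];       // the loop-exit value
        Next[Lane] = Mask[Lane] ? Updated : Phi[Lane]; // the select added here
      }
      return Next;
    }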
+ if (CM.foldTailByMasking()) { + Builder.setInsertPoint(VPBB); + auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan); + for (auto &Reduction : *Legal->getReductionVars()) { + VPValue *Phi = Plan->getVPValue(Reduction.first); + VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr()); + Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi}); + } + } + std::string PlanName; raw_string_ostream RSO(PlanName); unsigned VF = Range.Start; @@ -6993,7 +7300,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = llvm::make_unique<VPlan>(); + auto Plan = std::make_unique<VPlan>(); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -7012,9 +7319,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { } SmallPtrSet<Instruction *, 1> DeadInstructions; - VPlanHCFGTransforms::VPInstructionsToVPRecipes( - Plan, Legal->getInductionVars(), DeadInstructions); - + VPlanTransforms::VPInstructionsToVPRecipes( + OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } @@ -7023,13 +7329,21 @@ getOrCreateVectorValues(Value *V, unsigned Part) { return ILV.getOrCreateVectorValue(V, Part); } +Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue( + Value *V, const VPIteration &Instance) { + return ILV.getOrCreateScalarValue(V, Instance); +} + void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; for (unsigned i = 0; i < IG->getFactor(); ++i) @@ -7043,6 +7357,11 @@ void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Instr); } +void VPWidenGEPRecipe::execute(VPTransformState &State) { + State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant, + IsIndexLoopInvariant); +} + void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Int or FP induction being replicated."); State.ILV->widenIntOrFpInduction(IV, Trunc); @@ -7093,15 +7412,8 @@ void VPBlendRecipe::execute(VPTransformState &State) { void VPInterleaveRecipe::execute(VPTransformState &State) { assert(!State.Instance && "Interleave group being replicated."); - if (!User) - return State.ILV->vectorizeInterleaveGroup(IG->getInsertPos()); - - // Last (and currently only) operand is a mask. - InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), &MaskValues); + State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(), + getMask()); } void VPReplicateRecipe::execute(VPTransformState &State) { @@ -7188,15 +7500,46 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) { } void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - if (!User) - return State.ILV->vectorizeMemoryInstruction(&Instr); + State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask()); +} - // Last (and currently only) operand is a mask. 
- InnerLoopVectorizer::VectorParts MaskValues(State.UF); - VPValue *Mask = User->getOperand(User->getNumOperands() - 1); - for (unsigned Part = 0; Part < State.UF; ++Part) - MaskValues[Part] = State.get(Mask, Part); - State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && + !PreferPredicateOverEpilog; + + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . + if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; + + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7213,25 +7556,22 @@ static bool processLoopInVPlanNativePath( assert(EnableVPlanNativePath && "VPlan-native path is disabled."); Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - LoopVectorizationCostModel CM(L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, + + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); + + LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI); // Get user vectorization factor. const unsigned UserVF = Hints.getWidth(); - // Check the function attributes and profiles to find out if this function - // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); - // Plan how to best vectorize, return the best VF and its cost. 
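The four outcomes that getScalarEpilogueLowering can produce are scattered across this change; restated in one place for orientation (the enum's actual declaration and ordering live outside this hunk):

    enum ScalarEpilogueLowering {
      CM_ScalarEpilogueAllowed,               // default: emit a scalar remainder loop
      CM_ScalarEpilogueNotAllowedOptSize,     // optimizing for size: no epilogue
      CM_ScalarEpilogueNotAllowedLowTripLoop, // tiny trip count: no epilogue
      CM_ScalarEpilogueNotNeededUsePredicate  // fold the tail by masking instead
    };

Optimizing for size wins over everything else, an explicit request not to predicate comes next, then predication by flag, hint, or the TTI hook; the low-trip-count case is assigned separately in processLoop when the estimated trip count is tiny.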
- const VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -7310,10 +7650,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - bool OptForSize = - Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI)); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before @@ -7325,36 +7663,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { ORE, BFI, PSI, Hints); assert(L->empty() && "Inner loop expected."); + // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. - // Prefer constant trip counts over profile data, over upper bound estimate. - unsigned ExpectedTC = 0; - bool HasExpectedTC = false; - if (const SCEVConstant *ConstExits = - dyn_cast<SCEVConstant>(SE->getBackedgeTakenCount(L))) { - const APInt &ExitsCount = ConstExits->getAPInt(); - // We are interested in small values for ExpectedTC. Skip over those that - // can't fit an unsigned. - if (ExitsCount.ult(std::numeric_limits<unsigned>::max())) { - ExpectedTC = static_cast<unsigned>(ExitsCount.getZExtValue()) + 1; - HasExpectedTC = true; - } - } - // ExpectedTC may be large because it's bound by a variable. Check - // profiling information to validate we should vectorize. - if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) { - auto EstimatedTC = getLoopEstimatedTripCount(L); - if (EstimatedTC) { - ExpectedTC = *EstimatedTC; - HasExpectedTC = true; - } - } - if (!HasExpectedTC) { - ExpectedTC = SE->getSmallConstantMaxTripCount(L); - HasExpectedTC = (ExpectedTC > 0); - } - - if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) { + auto ExpectedTC = getSmallBestKnownTC(*SE, L); + if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " << "iteration overheads are incurred."); @@ -7362,10 +7675,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n"); else { LLVM_DEBUG(dbgs() << "\n"); - // Loops with a very small trip count are considered for vectorization - // under OptForSize, thereby making sure the cost of their loop body is - // dominant, free of runtime guards and scalar iteration overheads. - OptForSize = true; + SEL = CM_ScalarEpilogueNotAllowedLowTripLoop; } } @@ -7374,11 +7684,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // an integer loop and the vector instructions selected are purely integer // vector instructions? 
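The block of trip-count logic removed above is what the new getSmallBestKnownTC(*SE, L) call stands for: prefer an exact constant trip count, then a profile-based estimate, then the maximum trip-count bound. A behavioural sketch over plain inputs (the real helper queries ScalarEvolution and profile metadata; this signature is made up for illustration):

    #include <optional>

    std::optional<unsigned>
    smallBestKnownTripCount(std::optional<unsigned> ConstantTC,
                            std::optional<unsigned> ProfileEstimatedTC,
                            unsigned MaxTripCountBound) {
      if (ConstantTC)
        return ConstantTC;         // exact count known at compile time
      if (ProfileEstimatedTC)
        return ProfileEstimatedTC; // PGO-based estimate
      if (MaxTripCountBound > 0)
        return MaxTripCountBound;  // conservative upper bound
      return std::nullopt;         // nothing known about the trip count
    }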
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) { - LLVM_DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat" - "attribute is used.\n"); - ORE->emit(createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), - "NoImplicitFloat", L) - << "loop not vectorized due to NoImplicitFloat attribute"); + reportVectorizationFailure( + "Can't vectorize when the NoImplicitFloat attribute is used", + "loop not vectorized due to NoImplicitFloat attribute", + "NoImplicitFloat", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7389,11 +7698,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { // additional fp-math flags can help. if (Hints.isPotentiallyUnsafe() && TTI->isFPVectorizationPotentiallyUnsafe()) { - LLVM_DEBUG( - dbgs() << "LV: Potentially unsafe FP op prevents vectorization.\n"); - ORE->emit( - createLVMissedAnalysis(Hints.vectorizeAnalysisPassName(), "UnsafeFP", L) - << "loop not vectorized due to unsafe FP support."); + reportVectorizationFailure( + "Potentially unsafe FP op prevents vectorization", + "loop not vectorized due to unsafe FP support.", + "UnsafeFP", ORE, L); Hints.emitRemarkWithHints(); return false; } @@ -7411,18 +7719,18 @@ bool LoopVectorizePass::processLoop(Loop *L) { } // Use the cost model. - LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, + F, &Hints, IAI); CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); // Plan how to best vectorize, return the best VF and its cost. - Optional<VectorizationFactor> MaybeVF = LVP.plan(OptForSize, UserVF); + Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -7431,7 +7739,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } // Identify the diagnostic messages that should be produced. @@ -7609,7 +7917,8 @@ bool LoopVectorizePass::runImpl( // The second condition is necessary because, even if the target has no // vector registers, loop vectorization may still enable scalar // interleaving. 
- if (!TTI->getNumberOfRegisters(true) && TTI->getMaxInterleaveFactor(1) < 2) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) && + TTI->getMaxInterleaveFactor(1) < 2) return false; bool Changed = false; diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 27a86c0bca91..479bca83b51e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -72,6 +73,7 @@ #include "llvm/IR/Value.h" #include "llvm/IR/ValueHandle.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" @@ -127,6 +129,10 @@ static cl::opt<int> MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt<int> +MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden, + cl::desc("Maximum depth of the lookup for consecutive stores.")); + /// Limits the size of scheduling regions in a block. /// It avoid long compile times for _very_ large blocks where vector /// instructions are spread over a wide range. @@ -147,6 +153,20 @@ static cl::opt<unsigned> MinTreeSize( "slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable")); +// The maximum depth that the look-ahead score heuristic will explore. +// The higher this value, the higher the compilation time overhead. +static cl::opt<int> LookAheadMaxDepth( + "slp-max-look-ahead-depth", cl::init(2), cl::Hidden, + cl::desc("The maximum look-ahead depth for operand reordering scores")); + +// The Look-ahead heuristic goes through the users of the bundle to calculate +// the users cost in getExternalUsesCost(). To avoid compilation time increase +// we limit the number of users visited to this value. +static cl::opt<unsigned> LookAheadUsersBudget( + "slp-look-ahead-users-budget", cl::init(2), cl::Hidden, + cl::desc("The maximum number of users to visit while visiting the " + "predecessors. This prevents compilation time increase.")); + static cl::opt<bool> ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz")); @@ -194,10 +214,13 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } -/// \returns True if all of the values in \p VL are constants. +/// \returns True if all of the values in \p VL are constants (but not +/// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { + // Constant expressions and globals can't be vectorized like normal integer/FP + // constants. for (Value *i : VL) - if (!isa<Constant>(i)) + if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) return false; return true; } @@ -354,6 +377,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) { return S.OpValue; } +/// \returns true if \p Opcode is allowed as part of of the main/alternate +/// instruction for SLP vectorization. +/// +/// Example of unsupported opcode is SDIV that can potentially cause UB if the +/// "shuffled out" lane would result in division by zero. 
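A concrete illustration of the comment above. In an alternate-opcode bundle both operations are executed on every lane and a shuffle then picks the wanted lane from each result, so the lanes that get shuffled out are still evaluated; that is harmless for add/sub but would be undefined behaviour for sdiv/srem when a discarded lane divides by zero:

    #include <array>

    std::array<int, 2> altShuffleAddSub(std::array<int, 2> A,
                                        std::array<int, 2> B) {
      std::array<int, 2> Sums{A[0] + B[0], A[1] + B[1]};  // opcode 0 on all lanes
      std::array<int, 2> Diffs{A[0] - B[0], A[1] - B[1]}; // opcode 1 on all lanes
      return {Sums[0], Diffs[1]}; // shuffle: lane 0 from the adds, lane 1 from
                                  // the subs; the discarded lanes cost nothing.
                                  // With division, the discarded lanes would
                                  // still have been evaluated and might trap.
    }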
+static bool isValidForAlternation(unsigned Opcode) { + if (Instruction::isIntDivRem(Opcode)) + return false; + + return true; +} + /// \returns analysis of the Instructions in \p VL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. @@ -376,7 +411,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - if (Opcode == AltOpcode) { + if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) && + isValidForAlternation(Opcode)) { AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -388,6 +424,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL, if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { + assert(isValidForAlternation(Opcode) && + isValidForAlternation(InstOpcode) && + "Cast isn't safe for alternation, logic needs to be updated!"); AltOpcode = InstOpcode; AltIndex = Cnt; continue; @@ -486,6 +525,7 @@ namespace slpvectorizer { /// Bottom Up SLP Vectorizer. class BoUpSLP { struct TreeEntry; + struct ScheduleData; public: using ValueList = SmallVector<Value *, 8>; @@ -543,7 +583,7 @@ public: /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking - /// into account (anf updating it, if required) list of externally used + /// into account (and updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef<Value *> Roots, ExtraValueToDebugLocsMap &ExternallyUsedValues, @@ -605,7 +645,10 @@ public: return MinVecRegSize; } - /// Check if ArrayType or StructType is isomorphic to some VectorType. + /// Check if homogeneous aggregate is isomorphic to some VectorType. + /// Accepts homogeneous multidimensional aggregate of scalars/vectors like + /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, + /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -614,6 +657,15 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable() const; + /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -708,6 +760,7 @@ public: const DataLayout &DL; ScalarEvolution &SE; + const BoUpSLP &R; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -733,6 +786,227 @@ public: std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]); } + // The hard-coded scores listed here are not very important. When computing + // the scores of matching one sub-tree with another, we are basically + // counting the number of values that are matching. 
So even if all scores + // are set to 1, we would still get a decent matching result. + // However, sometimes we have to break ties. For example we may have to + // choose between matching loads vs matching opcodes. This is what these + // scores are helping us with: they provide the order of preference. + + /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). + static const int ScoreConsecutiveLoads = 3; + /// ExtractElementInst from same vector and consecutive indexes. + static const int ScoreConsecutiveExtracts = 3; + /// Constants. + static const int ScoreConstants = 2; + /// Instructions with the same opcode. + static const int ScoreSameOpcode = 2; + /// Instructions with alt opcodes (e.g, add + sub). + static const int ScoreAltOpcodes = 1; + /// Identical instructions (a.k.a. splat or broadcast). + static const int ScoreSplat = 1; + /// Matching with an undef is preferable to failing. + static const int ScoreUndef = 1; + /// Score for failing to find a decent match. + static const int ScoreFail = 0; + /// User external to the vectorized code. + static const int ExternalUseCost = 1; + /// The user is internal but in a different lane. + static const int UserInDiffLaneCost = ExternalUseCost; + + /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. + static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL, + ScalarEvolution &SE) { + auto *LI1 = dyn_cast<LoadInst>(V1); + auto *LI2 = dyn_cast<LoadInst>(V2); + if (LI1 && LI2) + return isConsecutiveAccess(LI1, LI2, DL, SE) + ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + + auto *C1 = dyn_cast<Constant>(V1); + auto *C2 = dyn_cast<Constant>(V2); + if (C1 && C2) + return VLOperands::ScoreConstants; + + // Extracts from consecutive indexes of the same vector better score as + // the extracts could be optimized away. + auto *Ex1 = dyn_cast<ExtractElementInst>(V1); + auto *Ex2 = dyn_cast<ExtractElementInst>(V2); + if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() && + cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 == + cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) { + return VLOperands::ScoreConsecutiveExtracts; + } + + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (I1 && I2) { + if (I1 == I2) + return VLOperands::ScoreSplat; + InstructionsState S = getSameOpcode({I1, I2}); + // Note: Only consider instructions with <= 2 operands to avoid + // complexity explosion. + if (S.getOpcode() && S.MainOp->getNumOperands() <= 2) + return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes + : VLOperands::ScoreSameOpcode; + } + + if (isa<UndefValue>(V2)) + return VLOperands::ScoreUndef; + + return VLOperands::ScoreFail; + } + + /// Holds the values and their lane that are taking part in the look-ahead + /// score calculation. This is used in the external uses cost calculation. + SmallDenseMap<Value *, int> InLookAheadValues; + + /// \Returns the additional cost due to uses of \p LHS and \p RHS that are + /// either external to the vectorized code, or require shuffling. + int getExternalUsesCost(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + int Cost = 0; + SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS}; + for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) { + Value *V = Values[Idx].first; + // Calculate the absolute lane, using the minimum relative lane of LHS + // and RHS as base and Idx as the offset. 
+ int Ln = std::min(LHS.second, RHS.second) + Idx; + assert(Ln >= 0 && "Bad lane calculation"); + unsigned UsersBudget = LookAheadUsersBudget; + for (User *U : V->users()) { + if (const TreeEntry *UserTE = R.getTreeEntry(U)) { + // The user is in the VectorizableTree. Check if we need to insert. + auto It = llvm::find(UserTE->Scalars, U); + assert(It != UserTE->Scalars.end() && "U is in UserTE"); + int UserLn = std::distance(UserTE->Scalars.begin(), It); + assert(UserLn >= 0 && "Bad lane"); + if (UserLn != Ln) + Cost += UserInDiffLaneCost; + } else { + // Check if the user is in the look-ahead code. + auto It2 = InLookAheadValues.find(U); + if (It2 != InLookAheadValues.end()) { + // The user is in the look-ahead code. Check the lane. + if (It2->second != Ln) + Cost += UserInDiffLaneCost; + } else { + // The user is neither in SLP tree nor in the look-ahead code. + Cost += ExternalUseCost; + } + } + // Limit the number of visited uses to cap compilation time. + if (--UsersBudget == 0) + break; + } + } + return Cost; + } + + /// Go through the operands of \p LHS and \p RHS recursively until \p + /// MaxLevel, and return the cumulative score. For example: + /// \verbatim + /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] + /// \ / \ / \ / \ / + /// + + + + + /// G1 G2 G3 G4 + /// \endverbatim + /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at + /// each level recursively, accumulating the score. It starts from matching + /// the additions at level 0, then moves on to the loads (level 1). The + /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and + /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while + /// {A[0],C[0]} has a score of VLOperands::ScoreFail. + /// Please note that the order of the operands does not matter, as we + /// evaluate the score of all profitable combinations of operands. In + /// other words the score of G1 and G4 is the same as G1 and G2. This + /// heuristic is based on ideas described in: + /// Look-ahead SLP: Auto-vectorization in the presence of commutative + /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, + /// Luís F. W. Góes + int getScoreAtLevelRec(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS, int CurrLevel, + int MaxLevel) { + + Value *V1 = LHS.first; + Value *V2 = RHS.first; + // Get the shallow score of V1 and V2. + int ShallowScoreAtThisLevel = + std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) - + getExternalUsesCost(LHS, RHS)); + int Lane1 = LHS.second; + int Lane2 = RHS.second; + + // If reached MaxLevel, + // or if V1 and V2 are not instructions, + // or if they are SPLAT, + // or if they are not consecutive, early return the current cost. + auto *I1 = dyn_cast<Instruction>(V1); + auto *I2 = dyn_cast<Instruction>(V2); + if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || + ShallowScoreAtThisLevel == VLOperands::ScoreFail || + (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel)) + return ShallowScoreAtThisLevel; + assert(I1 && I2 && "Should have early exited."); + + // Keep track of in-tree values for determining the external-use cost. + InLookAheadValues[V1] = Lane1; + InLookAheadValues[V2] = Lane2; + + // Contains the I2 operand indexes that got matched with I1 operands. + SmallSet<unsigned, 4> Op2Used; + + // Recursion towards the operands of I1 and I2. We are trying all possible + // operand pairs, and keeping track of the best score. 
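A numeric walk-through of the G1..G4 example from the comment block above, using the documented score constants and ignoring the external-use cost term: matching the two adds scores 2, and each pair of loads from consecutive addresses scores 3, so pairing G1 with G2 (or, per the comment, equivalently with G4) beats pairing G1 with G3, whose operands do not match at all:

    #include <cassert>

    int main() {
      const int ScoreSameOpcode = 2, ScoreConsecutiveLoads = 3, ScoreFail = 0;
      int ScoreG1G2 = ScoreSameOpcode + ScoreConsecutiveLoads + ScoreConsecutiveLoads;
      int ScoreG1G3 = ScoreSameOpcode + ScoreFail + ScoreFail;
      assert(ScoreG1G2 == 8 && ScoreG1G3 == 2);
      // Operand reordering therefore places G2 in the lane next to G1: the
      // deeper operand match promises consecutive vector loads, not gathers.
      return ScoreG1G2 > ScoreG1G3 ? 0 : 1;
    }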
+ for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); + OpIdx1 != NumOperands1; ++OpIdx1) { + // Try to pair op1I with the best operand of I2. + int MaxTmpScore = 0; + unsigned MaxOpIdx2 = 0; + bool FoundBest = false; + // If I2 is commutative try all combinations. + unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1; + unsigned ToIdx = isCommutative(I2) + ? I2->getNumOperands() + : std::min(I2->getNumOperands(), OpIdx1 + 1); + assert(FromIdx <= ToIdx && "Bad index"); + for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { + // Skip operands already paired with OpIdx1. + if (Op2Used.count(OpIdx2)) + continue; + // Recursively calculate the cost at each level + int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1}, + {I2->getOperand(OpIdx2), Lane2}, + CurrLevel + 1, MaxLevel); + // Look for the best score. + if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) { + MaxTmpScore = TmpScore; + MaxOpIdx2 = OpIdx2; + FoundBest = true; + } + } + if (FoundBest) { + // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. + Op2Used.insert(MaxOpIdx2); + ShallowScoreAtThisLevel += MaxTmpScore; + } + } + return ShallowScoreAtThisLevel; + } + + /// \Returns the look-ahead score, which tells us how much the sub-trees + /// rooted at \p LHS and \p RHS match, the more they match the higher the + /// score. This helps break ties in an informed way when we cannot decide on + /// the order of the operands by just considering the immediate + /// predecessors. + int getLookAheadScore(const std::pair<Value *, int> &LHS, + const std::pair<Value *, int> &RHS) { + InLookAheadValues.clear(); + return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth); + } + // Search all operands in Ops[*][Lane] for the one that matches best // Ops[OpIdx][LastLane] and return its opreand index. // If no good match can be found, return None. @@ -750,9 +1024,6 @@ public: // The linearized opcode of the operand at OpIdx, Lane. bool OpIdxAPO = getData(OpIdx, Lane).APO; - const unsigned BestScore = 2; - const unsigned GoodScore = 1; - // The best operand index and its score. // Sometimes we have more than one option (e.g., Opcode and Undefs), so we // are using the score to differentiate between the two. @@ -781,41 +1052,19 @@ public: // Look for an operand that matches the current mode. switch (RMode) { case ReorderingMode::Load: - if (isa<LoadInst>(Op)) { - // Figure out which is left and right, so that we can check for - // consecutive loads - bool LeftToRight = Lane > LastLane; - Value *OpLeft = (LeftToRight) ? OpLastLane : Op; - Value *OpRight = (LeftToRight) ? Op : OpLastLane; - if (isConsecutiveAccess(cast<LoadInst>(OpLeft), - cast<LoadInst>(OpRight), DL, SE)) - BestOp.Idx = Idx; - } - break; - case ReorderingMode::Opcode: - // We accept both Instructions and Undefs, but with different scores. - if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) && - cast<Instruction>(Op)->getOpcode() == - cast<Instruction>(OpLastLane)->getOpcode()) || - (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) || - isa<UndefValue>(Op)) { - // An instruction has a higher score than an undef. - unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } - } - break; case ReorderingMode::Constant: - if (isa<Constant>(Op)) { - unsigned Score = (isa<UndefValue>(Op)) ? 
GoodScore : BestScore; - if (Score > BestOp.Score) { - BestOp.Idx = Idx; - BestOp.Score = Score; - } + case ReorderingMode::Opcode: { + bool LeftToRight = Lane > LastLane; + Value *OpLeft = (LeftToRight) ? OpLastLane : Op; + Value *OpRight = (LeftToRight) ? Op : OpLastLane; + unsigned Score = + getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane}); + if (Score > BestOp.Score) { + BestOp.Idx = Idx; + BestOp.Score = Score; } break; + } case ReorderingMode::Splat: if (Op == OpLastLane) BestOp.Idx = Idx; @@ -946,8 +1195,8 @@ public: public: /// Initialize with all the operands of the instruction vector \p RootVL. VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL, - ScalarEvolution &SE) - : DL(DL), SE(SE) { + ScalarEvolution &SE, const BoUpSLP &R) + : DL(DL), SE(SE), R(R) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1117,6 +1366,14 @@ public: #endif }; + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Marks values operands for later deletion by replacing them with Undefs. + void eraseInstructions(ArrayRef<Value *> AV); + + ~BoUpSLP(); + private: /// Checks if all users of \p I are the part of the vectorization tree. bool areAllUsersVectorized(Instruction *I) const; @@ -1153,8 +1410,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + void setInsertPointAfterBundle(TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty); @@ -1169,7 +1425,8 @@ private: SmallVectorImpl<Value *> &Left, SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE); + ScalarEvolution &SE, + const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1191,7 +1448,8 @@ private: Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather = false; + enum EntryState { Vectorize, NeedToGather }; + EntryState State; /// Does this sequence require some shuffling? SmallVector<unsigned, 4> ReuseShuffleIndices; @@ -1220,27 +1478,37 @@ private: /// reordering of operands during buildTree_rec() and vectorizeTree(). SmallVector<ValueList, 2> Operands; + /// The main/alternate instruction. + Instruction *MainOp = nullptr; + Instruction *AltOp = nullptr; + public: /// Set this bundle's \p OpIdx'th operand to \p OpVL. - void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { + void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); assert(Operands[OpIdx].size() == 0 && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) - Operands[OpIdx][Lane] = (!ReuseShuffleIndices.empty()) - ? OpVL[ReuseShuffleIndices[Lane]] - : OpVL[Lane]; - } - - /// If there is a user TreeEntry, then set its operand. - void trySetUserTEOperand(const EdgeInfo &UserTreeIdx, - ArrayRef<Value *> OpVL, - ArrayRef<unsigned> ReuseShuffleIndices) { - if (UserTreeIdx.UserTE) - UserTreeIdx.UserTE->setOperand(UserTreeIdx.EdgeIdx, OpVL, - ReuseShuffleIndices); + Operands[OpIdx][Lane] = OpVL[Lane]; + } + + /// Set the operands of this bundle in their original order. 
+ void setOperandsInOrder() { + assert(Operands.empty() && "Already initialized?"); + auto *I0 = cast<Instruction>(Scalars[0]); + Operands.resize(I0->getNumOperands()); + unsigned NumLanes = Scalars.size(); + for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) { + Operands[OpIdx].resize(NumLanes); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + auto *I = cast<Instruction>(Scalars[Lane]); + assert(I->getNumOperands() == NumOperands && + "Expected same number of operands"); + Operands[OpIdx][Lane] = I->getOperand(OpIdx); + } + } } /// \returns the \p OpIdx operand of this TreeEntry. @@ -1249,6 +1517,9 @@ private: return Operands[OpIdx]; } + /// \returns the number of operands. + unsigned getNumOperands() const { return Operands.size(); } + /// \return the single \p OpIdx operand. Value *getSingleOperand(unsigned OpIdx) const { assert(OpIdx < Operands.size() && "Off bounds"); @@ -1256,6 +1527,58 @@ private: return Operands[OpIdx][0]; } + /// Some of the instructions in the list have alternate opcodes. + bool isAltShuffle() const { + return getOpcode() != getAltOpcode(); + } + + bool isOpcodeOrAlt(Instruction *I) const { + unsigned CheckedOpcode = I->getOpcode(); + return (getOpcode() == CheckedOpcode || + getAltOpcode() == CheckedOpcode); + } + + /// Chooses the correct key for scheduling data. If \p Op has the same (or + /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is + /// \p OpValue. + Value *isOneOf(Value *Op) const { + auto *I = dyn_cast<Instruction>(Op); + if (I && isOpcodeOrAlt(I)) + return Op; + return MainOp; + } + + void setOperations(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + } + + Instruction *getMainOp() const { + return MainOp; + } + + Instruction *getAltOp() const { + return AltOp; + } + + /// The main/alternate opcodes for the list of instructions. + unsigned getOpcode() const { + return MainOp ? MainOp->getOpcode() : 0; + } + + unsigned getAltOpcode() const { + return AltOp ? AltOp->getOpcode() : 0; + } + + /// Update operations state of this entry if reorder occurred. + bool updateStateIfReorder() { + if (ReorderIndices.empty()) + return false; + InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front()); + setOperations(S); + return true; + } + #ifndef NDEBUG /// Debug printer. 
LLVM_DUMP_METHOD void dump() const { @@ -1268,23 +1591,40 @@ private: dbgs() << "Scalars: \n"; for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; - dbgs() << "NeedToGather: " << NeedToGather << "\n"; + dbgs() << "State: "; + switch (State) { + case Vectorize: + dbgs() << "Vectorize\n"; + break; + case NeedToGather: + dbgs() << "NeedToGather\n"; + break; + } + dbgs() << "MainOp: "; + if (MainOp) + dbgs() << *MainOp << "\n"; + else + dbgs() << "NULL\n"; + dbgs() << "AltOp: "; + if (AltOp) + dbgs() << *AltOp << "\n"; + else + dbgs() << "NULL\n"; dbgs() << "VectorizedValue: "; if (VectorizedValue) - dbgs() << *VectorizedValue; + dbgs() << *VectorizedValue << "\n"; else - dbgs() << "NULL"; - dbgs() << "\n"; + dbgs() << "NULL\n"; dbgs() << "ReuseShuffleIndices: "; if (ReuseShuffleIndices.empty()) dbgs() << "Emtpy"; else - for (unsigned Idx : ReuseShuffleIndices) - dbgs() << Idx << ", "; + for (unsigned ReuseIdx : ReuseShuffleIndices) + dbgs() << ReuseIdx << ", "; dbgs() << "\n"; dbgs() << "ReorderIndices: "; - for (unsigned Idx : ReorderIndices) - dbgs() << Idx << ", "; + for (unsigned ReorderIdx : ReorderIndices) + dbgs() << ReorderIdx << ", "; dbgs() << "\n"; dbgs() << "UserTreeIndices: "; for (const auto &EInfo : UserTreeIndices) @@ -1295,23 +1635,36 @@ private: }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, + TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle, + const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef<unsigned> ReuseShuffleIndices = None, ArrayRef<unsigned> ReorderIndices = None) { - VectorizableTree.push_back(llvm::make_unique<TreeEntry>(VectorizableTree)); + bool Vectorized = (bool)Bundle; + VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); - Last->NeedToGather = !Vectorized; + Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; + Last->setOperations(S); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last->Idx; - } + ScalarToTreeEntry[VL[i]] = Last; + } + // Update the scheduler bundle to point to this TreeEntry. + unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; + } + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -1319,7 +1672,6 @@ private: if (UserTreeIdx.UserTE) Last->UserTreeIndices.push_back(UserTreeIdx); - Last->trySetUserTEOperand(UserTreeIdx, VL, ReuseShuffleIndices); return Last; } @@ -1340,19 +1692,19 @@ private: TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } const TreeEntry *getTreeEntry(Value *V) const { auto I = ScalarToTreeEntry.find(V); if (I != ScalarToTreeEntry.end()) - return VectorizableTree[I->second].get(); + return I->second; return nullptr; } /// Maps a specific scalar to its tree entry. 
- SmallDenseMap<Value*, int> ScalarToTreeEntry; + SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -1408,15 +1760,14 @@ private: /// This is required to ensure that there are no incorrect collisions in the /// AliasCache, which can happen if a new instruction is allocated at the /// same address as a previously deleted instruction. - void eraseInstruction(Instruction *I) { - I->removeFromParent(); - I->dropAllReferences(); - DeletedInstructions.emplace_back(I); + void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) { + auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first; + It->getSecond() = It->getSecond() && ReplaceOpsWithUndef; } /// Temporary store for deleted instructions. Instructions will be deleted /// eventually when the BoUpSLP is destructed. - SmallVector<unique_value, 8> DeletedInstructions; + DenseMap<Instruction *, bool> DeletedInstructions; /// A list of values that need to extracted out of the tree. /// This list holds pairs of (Internal Scalar : External User). External User @@ -1453,6 +1804,8 @@ private: UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); OpValue = OpVal; + TE = nullptr; + Lane = -1; } /// Returns true if the dependency information has been calculated. @@ -1559,6 +1912,12 @@ private: /// Opcode of the current instruction in the schedule data. Value *OpValue = nullptr; + + /// The TreeEntry that this instruction corresponds to. + TreeEntry *TE = nullptr; + + /// The lane of this node in the TreeEntry. + int Lane = -1; }; #ifndef NDEBUG @@ -1615,7 +1974,7 @@ private: return nullptr; } - bool isInSchedulingRegion(ScheduleData *SD) { + bool isInSchedulingRegion(ScheduleData *SD) const { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -1633,10 +1992,9 @@ private: continue; } // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - auto *I = dyn_cast<Instruction>(U.get()); - if (!I) - continue; + + // Decrement the unscheduled counter and insert to ready list if ready. + auto &&DecrUnsched = [this, &ReadyList](Instruction *I) { doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { if (OpDef && OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { @@ -1651,6 +2009,24 @@ private: << "SLP: gets ready (def): " << *DepBundle << "\n"); } }); + }; + + // If BundleMember is a vector bundle, its operands may have been + // reordered duiring buildTree(). We therefore need to get its operands + // through the TreeEntry. + if (TreeEntry *TE = BundleMember->TE) { + int Lane = BundleMember->Lane; + assert(Lane >= 0 && "Lane not set"); + for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands(); + OpIdx != NumOperands; ++OpIdx) + if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane])) + DecrUnsched(I); + } else { + // If BundleMember is a stand-alone instruction, no operand reordering + // has taken place, so we directly access its operands. + for (Use &U : BundleMember->Inst->operands()) + if (auto *I = dyn_cast<Instruction>(U.get())) + DecrUnsched(I); } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1697,8 +2073,11 @@ private: /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. 
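The eraseInstruction change above moves to a deferred-deletion scheme: instructions are only marked while vectorizing, so their addresses stay valid for pointer-keyed caches such as the alias cache, and the stored flag ends up true only if every marking asked for the uses to be replaced with undef; the actual erasure happens in the destructor added further below. A plain-map model of the marking rule (illustrative, not the BoUpSLP member):

    #include <map>
    #include <string>

    struct DeferredEraser {
      std::map<std::string, bool> Deleted; // instruction -> replace-uses-with-undef

      void markErased(const std::string &Inst, bool ReplaceWithUndef = false) {
        auto It = Deleted.emplace(Inst, ReplaceWithUndef).first;
        It->second = It->second && ReplaceWithUndef; // AND across all markings
      }
    };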
- bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + /// \returns the scheduling bundle. The returned Optional value is non-None + /// if \p VL is allowed to be scheduled. + Optional<ScheduleData *> + tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S); /// Un-bundles a group of instructions. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue); @@ -1937,7 +2316,7 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *) { - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) return "color=red"; return ""; } @@ -1945,6 +2324,30 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { } // end namespace llvm +BoUpSLP::~BoUpSLP() { + for (const auto &Pair : DeletedInstructions) { + // Replace operands of ignored instructions with Undefs in case if they were + // marked for deletion. + if (Pair.getSecond()) { + Value *Undef = UndefValue::get(Pair.getFirst()->getType()); + Pair.getFirst()->replaceAllUsesWith(Undef); + } + Pair.getFirst()->dropAllReferences(); + } + for (const auto &Pair : DeletedInstructions) { + assert(Pair.getFirst()->use_empty() && + "trying to erase instruction with users."); + Pair.getFirst()->eraseFromParent(); + } +} + +void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) { + for (auto *V : AV) { + if (auto *I = dyn_cast<Instruction>(V)) + eraseInstruction(I, /*ReplaceWithUndef=*/true); + }; +} + void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ArrayRef<Value *> UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; @@ -1965,7 +2368,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; // For each lane: @@ -2002,7 +2405,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!UseEntry->NeedToGather && "Bad state"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); continue; } } @@ -2026,28 +2429,28 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2055,11 +2458,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2069,7 +2472,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -2077,19 +2480,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, E->UserTreeIndices.push_back(UserTreeIdx); LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); - E->trySetUserTEOperand(UserTreeIdx, VL, None); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast<Instruction>(VL[i]); + for (Value *V : VL) { + auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2097,10 +2499,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } } @@ -2114,7 +2516,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. 
LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } @@ -2128,13 +2530,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Res.second) UniqueValues.emplace_back(V); } - if (UniqueValues.size() == VL.size()) { + size_t NumUniqueScalarValues = UniqueValues.size(); + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + if (NumUniqueScalarValues <= 1 || + !llvm::isPowerOf2_32(NumUniqueScalarValues)) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; } VL = UniqueValues; @@ -2142,16 +2546,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto &BSRef = BlocksSchedules[BB]; if (!BSRef) - BSRef = llvm::make_unique<BlockScheduling>(BB); + BSRef = std::make_unique<BlockScheduling>(BB); BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S); + if (!Bundle) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -2160,7 +2566,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); // Check for terminator values (e.g. invoke). for (unsigned j = 0; j < VL.size(); ++j) @@ -2172,23 +2578,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = + newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); + // Keeps the reordered operands to avoid code duplication. + SmallVector<ValueList, 2> OperandsVec; for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
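
// A self-contained sketch of the reused-scalars preparation earlier in this
// hunk (UniqueValues / ReuseShuffleIndicies): the bundle is deduplicated,
// each original lane remembers which unique scalar it maps to, and
// vectorization is abandoned when the unique count is 1 or not a power of
// two. Plain ints stand in for llvm::Value*.
#include <cstdio>
#include <unordered_map>
#include <vector>

bool isPowerOf2(unsigned X) { return X && !(X & (X - 1)); }

bool prepareReuse(const std::vector<int> &VL, std::vector<int> &Unique,
                  std::vector<unsigned> &ReuseIdx) {
  std::unordered_map<int, unsigned> Pos;
  for (int V : VL) {
    auto It = Pos.try_emplace(V, Unique.size()).first;
    if (It->second == Unique.size()) // first time we see this scalar
      Unique.push_back(V);
    ReuseIdx.push_back(It->second);  // lane -> unique element it reuses
  }
  if (Unique.size() == VL.size()) {  // nothing was duplicated
    ReuseIdx.clear();
    return true;
  }
  return Unique.size() > 1 && isPowerOf2(Unique.size());
}

int main() {
  std::vector<int> VL{7, 9, 7, 9}, Unique;
  std::vector<unsigned> ReuseIdx;
  bool OK = prepareReuse(VL, Unique, ReuseIdx);
  std::printf("ok=%d unique=%zu lane3->%u\n", OK, Unique.size(), ReuseIdx[3]);
  return 0; // prints: ok=1 unique=2 lane3->1
}
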
for (Value *j : VL) Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::ExtractValue: @@ -2198,13 +2610,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } if (!CurrentOrder.empty()) { @@ -2220,17 +2632,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); // This is a special case, as it does not gather, but at the same time // we are not extending buildTree_rec() towards the operands. ValueList Op0; Op0.assign(VL.size(), VL0->getOperand(0)); - VectorizableTree.back()->setOperand(0, Op0, ReuseShuffleIndicies); + VectorizableTree.back()->setOperand(0, Op0); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -2246,7 +2660,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -2259,7 +2674,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, auto *L = cast<LoadInst>(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -2285,19 +2701,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. 
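
// The consecutiveness test just above, in standalone form. Once the sorted
// pointer order is known, the pass confirms with one SCEV subtraction that
// last - first == (N - 1) * sizeof(element); this sketch spells out the
// per-gap version of the same condition with plain integer addresses
// standing in for pointers and SCEV.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

bool areConsecutive(std::vector<uint64_t> Addrs, uint64_t EltSize) {
  std::sort(Addrs.begin(), Addrs.end()); // the pass gets this from sorting
  for (size_t I = 1; I < Addrs.size(); ++I)
    if (Addrs[I] - Addrs[I - 1] != EltSize)
      return false;                      // gap or duplicate: not consecutive
  return true;
}

int main() {
  std::printf("%d\n", areConsecutive({0x108, 0x100, 0x104, 0x10c}, 4)); // 1
  std::printf("%d\n", areConsecutive({0x100, 0x108, 0x110, 0x118}, 4)); // 0
  return 0;
}
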
++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2306,7 +2725,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -2322,24 +2742,27 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *Ty = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
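
// A sketch of the order "voting" seen above: every vectorizable bundle
// either votes for keeping the original order or for one concrete
// permutation, and the most popular choice wins when the tree is finalized.
// The free function and the std::map are assumptions of this toy model.
#include <cstdio>
#include <map>
#include <vector>

std::map<std::vector<unsigned>, int> NumOpsWantToKeepOrder;
int NumOpsWantToKeepOriginalOrder = 0;

void vote(const std::vector<unsigned> &CurrentOrder) {
  if (CurrentOrder.empty())
    ++NumOpsWantToKeepOriginalOrder; // bundle is already consecutive
  else
    ++NumOpsWantToKeepOrder.try_emplace(CurrentOrder, 0).first->second;
}

int main() {
  vote({});           // one bundle that is fine as-is
  vote({1, 0, 3, 2}); // two jumbled bundles agreeing on a permutation
  vote({1, 0, 3, 2});
  std::printf("original=%d best=%d\n", NumOpsWantToKeepOriginalOrder,
              NumOpsWantToKeepOrder[{1, 0, 3, 2}]); // original=1 best=2
  return 0;
}
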
- for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2351,19 +2774,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast<CmpInst>(VL[i]); + for (Value *V : VL) { + CmpInst *Cmp = cast<CmpInst>(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); ValueList Left, Right; @@ -2371,7 +2796,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { @@ -2384,7 +2809,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Right.push_back(RHS); } } - + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; @@ -2409,19 +2835,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, case Instruction::And: case Instruction::Or: case Instruction::Xor: { - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -2434,11 +2864,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
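
// The swapped-predicate canonicalisation from the compare handling above,
// as a standalone toy: a bundle may mix `a > b` with `b < a`, and commuting
// the operands of the swapped form lets every lane share one predicate.
// The enum and struct are stand-ins, not llvm::CmpInst.
#include <cstdio>
#include <vector>

enum Pred { SLT, SGT };
Pred swapped(Pred P) { return P == SLT ? SGT : SLT; }

struct Cmp { Pred P; int L, R; };

bool splitOperands(const std::vector<Cmp> &VL, Pred P0,
                   std::vector<int> &Left, std::vector<int> &Right) {
  for (const Cmp &C : VL) {
    if (C.P == P0) {                 // same predicate: keep operand order
      Left.push_back(C.L);
      Right.push_back(C.R);
    } else if (C.P == swapped(P0)) { // swapped predicate: commute operands
      Left.push_back(C.R);
      Right.push_back(C.L);
    } else {
      return false;                  // anything else forces a gather
    }
  }
  return true;
}

int main() {
  std::vector<Cmp> VL{{SGT, 1, 2}, {SLT, 4, 3}}; // 1 > 2, and 4 < 3
  std::vector<int> L, R;
  bool OK = splitOperands(VL, SGT, L, R);
  std::printf("ok=%d lane1: %d > %d\n", OK, L[1], R[1]); // ok=1 lane1: 3 > 4
  return 0;
}
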
- for (unsigned j = 0; j < VL.size(); ++j) { - if (cast<Instruction>(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (cast<Instruction>(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } @@ -2446,59 +2877,120 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast<Instruction>(VL[j])->getOperand(1); - if (!isa<ConstantInt>(Op)) { + Type *Ty1 = VL0->getOperand(1)->getType(); + for (Value *V : VL) { + auto Op = cast<Instruction>(V)->getOperand(1); + if (!isa<ConstantInt>(Op) || + (Op->getType() != Ty1 && + Op->getType()->getScalarSizeInBits() > + DL->getIndexSizeInBits( + V->getType()->getPointerAddressSpace()))) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); + TE->setOperandsInOrder(); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } return; } case Instruction::Store: { - // Check if the stores are consecutive or of we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + // Check if the stores are consecutive or if we need to swizzle them. + llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType(); + // Make sure all stores in the bundle are simple - we can't vectorize + // atomic or volatile stores. 
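
// Why the GEP index-width constraint above exists: the constant indices are
// later normalised to one common index type, and a value must sign-extend or
// zero-extend cleanly for that to be lossless. The choice is made from the
// sign bit in the original width (the isSignBitSet() test used when the GEP
// is actually vectorized further down). Fixed i8 -> i64 widths keep this
// sketch simple.
#include <cstdint>
#include <cstdio>

int64_t widenIndex(uint8_t Raw) {
  bool SignBitSet = (Raw & 0x80) != 0;
  return SignBitSet ? (int64_t)(int8_t)Raw // sign-extend: 0xFF becomes -1
                    : (int64_t)Raw;        // zero-extend small positives
}

int main() {
  std::printf("%lld %lld\n", (long long)widenIndex(0xFF),
              (long long)widenIndex(0x7F)); // -1 127
  return 0;
}
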
+ SmallVector<Value *, 4> PointerOps(VL.size()); + ValueList Operands(VL.size()); + auto POIter = PointerOps.begin(); + auto OIter = Operands.begin(); + for (Value *V : VL) { + auto *SI = cast<StoreInst>(V); + if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n"); return; } + *POIter = SI->getPointerOperand(); + *OIter = SI->getValueOperand(); + ++POIter; + ++OIter; + } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - - ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(0)); + OrdersType CurrentOrder; + // Check the order of pointer operands. + if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = + dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (CurrentOrder.empty()) { + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); + } else { + // Need to reorder. 
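
// A sketch of the pointer-sorting step above that feeds the jumbled-store
// path continuing below: compute the permutation that sorts the addresses,
// and report "original order" with an empty permutation, matching the
// CurrentOrder.empty() convention. Plain integers stand in for SCEV-derived
// offsets.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

std::vector<unsigned> sortedOrder(const std::vector<uint64_t> &Addrs) {
  std::vector<unsigned> Order(Addrs.size());
  std::iota(Order.begin(), Order.end(), 0u);
  std::stable_sort(Order.begin(), Order.end(),
                   [&](unsigned A, unsigned B) { return Addrs[A] < Addrs[B]; });
  if (std::is_sorted(Addrs.begin(), Addrs.end()))
    Order.clear(); // already in program order: no reordering needed
  return Order;
}

int main() {
  for (unsigned I : sortedOrder({0x108, 0x100, 0x104, 0x10c}))
    std::printf("%u ", I); // 1 2 0 3
  std::printf("\n");
  return 0;
}
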
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + } + return; + } + } - buildTree_rec(Operands, Depth + 1, {TE, 0}); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } case Instruction::Call: { @@ -2509,7 +3001,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -2519,14 +3012,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast<CallInst>(VL[i]); + for (Value *V : VL) { + CallInst *CI2 = dyn_cast<CallInst>(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } @@ -2537,7 +3031,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Value *A1J = CI2->getArgOperand(j); if (ScalarArgs[j] != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << ScalarArgs[j] << "!=" << A1J << "\n"); @@ -2551,19 +3046,22 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + TE->setOperandsInOrder(); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast<CallInst>(j); + for (Value *V : VL) { + auto *CI2 = cast<CallInst>(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2575,27 +3073,32 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // then do not vectorize this instruction. 
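
// The call-bundle screening above, reduced to a toy: every lane must invoke
// the same trivially-vectorizable intrinsic, and any operand that has to
// stay scalar (like the exponent of powi) must be identical in all lanes.
// The callee name and the scalar-argument index are illustrative
// assumptions, not the real matching logic.
#include <cstdio>
#include <string>
#include <vector>

struct Call { std::string Callee; std::vector<int> Args; };

bool canVectorizeCalls(const std::vector<Call> &VL, unsigned ScalarArgIdx) {
  const Call &C0 = VL.front();
  for (const Call &C : VL) {
    if (C.Callee != C0.Callee)
      return false; // mismatched callees: the bundle must be gathered
    if (C.Args[ScalarArgIdx] != C0.Args[ScalarArgIdx])
      return false; // the scalar operand has to match across lanes
  }
  return true;
}

int main() {
  std::vector<Call> VL{{"llvm.powi.f32", {0, 3}}, {"llvm.powi.f32", {1, 3}}};
  std::printf("%d\n", canVectorizeCalls(VL, 1)); // 1: same callee, same exp
  return 0;
}
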
if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - auto *TE = newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa<BinaryOperator>(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + TE->setOperand(0, Left); + TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); buildTree_rec(Right, Depth + 1, {TE, 1}); return; } + TE->setOperandsInOrder(); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast<Instruction>(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back(cast<Instruction>(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2603,34 +3106,37 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast<StructType>(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast<ArrayType>(T)->getNumElements(); - EltTy = cast<ArrayType>(T)->getElementType(); + unsigned N = 1; + Type *EltTy = T; + + while (isa<CompositeType>(EltTy)) { + if (auto *ST = dyn_cast<StructType>(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast<SequentialType>(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } + if (!isValidElementType(EltTy)) return 0; uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. 
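
// The rewritten canMapToVector() loop above, modelled standalone: walk into
// nested arrays and homogeneous structs, multiplying the element counts,
// until a scalar element type remains. A toy type node replaces llvm::Type.
#include <cstdio>
#include <vector>

struct Ty {
  bool IsScalar;
  std::vector<Ty *> Elts; // members for structs, {element} for arrays
  unsigned NumElts;       // aggregate element count
};

unsigned flatten(Ty *T, Ty **EltTy) {
  unsigned N = 1;
  while (!T->IsScalar) {
    for (Ty *E : T->Elts)           // homogeneity: all members identical
      if (E != T->Elts.front())
        return 0;
    N *= T->NumElts;
    T = T->Elts.front();
  }
  *EltTy = T;
  return N;
}

int main() {
  Ty F{true, {}, 0};                // float
  Ty A{false, {&F}, 4};             // [4 x float]
  Ty S{false, {&A, &A}, 2};         // { [4 x float], [4 x float] }
  Ty *Elt = nullptr;
  std::printf("N=%u\n", flatten(&S, &Elt)); // N=8 -> maps to <8 x float>
  return 0;
}
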
- for (const auto *Ty : ST->elements()) - if (Ty != EltTy) - return 0; - } return N; } @@ -2731,14 +3237,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ReuseShuffleCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - if (E->NeedToGather) { + if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2761,11 +3267,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast<Instruction>(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = E->getMainOp(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2800,7 +3305,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. @@ -2851,7 +3356,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2864,7 +3369,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2872,14 +3377,14 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FNeg: @@ -2940,13 +3445,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { SmallVector<const Value *, 4> Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost( + E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2967,7 +3472,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Load: { // Cost of wide load - cost of scalar loads. - unsigned alignment = cast<LoadInst>(VL0)->getAlignment(); + MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment()); int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { @@ -2985,15 +3490,22 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. - unsigned alignment = cast<StoreInst>(VL0)->getAlignment(); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = + cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0); + MaybeAlign Alignment(SI->getAlignment()); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); - if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - } + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); + if (NeedToShuffleReuses) + ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, + VecTy, Alignment, 0, VL0); + if (IsReorder) { + // TODO: Merge this shuffle with the ReuseShuffleCost. 
+ VecStCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3027,11 +3539,11 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -3046,25 +3558,25 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast<Instruction>(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (Value *V : VL) { + Instruction *I = cast<Instruction>(V); + assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType(); + Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3079,25 +3591,64 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { << VectorizableTree.size() << " is fully vectorizable .\n"); // We only handle trees of heights 1 and 2. - if (VectorizableTree.size() == 1 && !VectorizableTree[0]->NeedToGather) + if (VectorizableTree.size() == 1 && + VectorizableTree[0]->State == TreeEntry::Vectorize) return true; if (VectorizableTree.size() != 2) return false; // Handle splat and all-constants stores. - if (!VectorizableTree[0]->NeedToGather && + if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || isSplat(VectorizableTree[1]->Scalars))) return true; // Gathering cost would be too much for tiny trees. 
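
// The sign convention used throughout getEntryCost() above, shown
// concretely: each case returns (vector cost) - (sum of scalar costs), plus
// a shuffle cost when lanes are reused, so a negative total means
// vectorization wins. The numbers below are made up for illustration.
#include <cstdio>

int entryCostDelta(int NumLanes, int ScalarEltCost, int VecOpCost,
                   int ReuseShuffleCost = 0) {
  int ScalarCost = NumLanes * ScalarEltCost;
  return ReuseShuffleCost + VecOpCost - ScalarCost;
}

int main() {
  // Four scalar adds at cost 1 each versus one vector add at cost 1.
  std::printf("delta=%d\n", entryCostDelta(4, 1, 1)); // -3: profitable
  return 0;
}
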
- if (VectorizableTree[0]->NeedToGather || VectorizableTree[1]->NeedToGather) + if (VectorizableTree[0]->State == TreeEntry::NeedToGather || + VectorizableTree[1]->State == TreeEntry::NeedToGather) return false; return true; } +bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { + if (RdxOpcode != Instruction::Or) + return false; + + unsigned NumElts = VectorizableTree[0]->Scalars.size(); + Value *FirstReduced = VectorizableTree[0]->Scalars[0]; + + // Look past the reduction to find a source value. Arbitrarily follow the + // path through operand 0 of any 'or'. Also, peek through optional + // shift-left-by-constant. + Value *ZextLoad = FirstReduced; + while (match(ZextLoad, m_Or(m_Value(), m_Value())) || + match(ZextLoad, m_Shl(m_Value(), m_Constant()))) + ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); + + // Check if the input to the reduction is an extended load. + Value *LoadPtr; + if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + return false; + + // Require that the total load bit width is a legal integer type. + // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. + // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. + Type *SrcTy = LoadPtr->getType()->getPointerElementType(); + unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; + LLVMContext &Context = FirstReduced->getContext(); + if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth))) + return false; + + // Everything matched - assume that we can fold the whole sequence using + // load combining. + LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of " + << *(cast<Instruction>(FirstReduced)) << "\n"); + + return true; +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -3165,7 +3716,7 @@ int BoUpSLP::getSpillCost() const { continue; } - // Debug informations don't impact spill cost. + // Debug information does not impact spill cost. if ((isa<CallInst>(&*PrevInstIt) && !isa<DbgInfoIntrinsic>(&*PrevInstIt)) && &*PrevInstIt != PrevInst) @@ -3209,12 +3760,13 @@ int BoUpSLP::getTreeCost() { // their uses. Since such an approach results in fewer total entries, // existing heuristics based on tree size may yield different results. // - if (TE.NeedToGather && - std::any_of( - std::next(VectorizableTree.begin(), I + 1), VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->NeedToGather && EntryPtr->isSame(TE.Scalars); - })) + if (TE.State == TreeEntry::NeedToGather && + std::any_of(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), + [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { + return EntryPtr->State == TreeEntry::NeedToGather && + EntryPtr->isSame(TE.Scalars); + })) continue; int C = getEntryCost(&TE); @@ -3306,29 +3858,31 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. 
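
// What isLoadCombineReductionCandidate() above is really detecting, written
// out in plain C++: an `or` reduction over zext'd loads shifted into
// disjoint byte positions is just a wide little-endian load, so the backend
// folds it better than a vectorized reduction would. Illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstring>

uint32_t orReduction(const uint8_t *P) {
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
         ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
}

uint32_t wideLoad(const uint8_t *P) {
  uint32_t V;
  std::memcpy(&V, P, sizeof(V)); // what load combining produces (LE host)
  return V;
}

int main() {
  uint8_t Buf[4] = {0x44, 0x33, 0x22, 0x11};
  std::printf("%08x %08x\n", orReduction(Buf), wideLoad(Buf)); // equal on LE
  return 0;
}
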
-void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left, - SmallVectorImpl<Value *> &Right, const DataLayout &DL, - ScalarEvolution &SE) { +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, + SmallVectorImpl<Value *> &Left, + SmallVectorImpl<Value *> &Right, + const DataLayout &DL, + ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast<Instruction>(S.OpValue); + auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast<Instruction>(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), + [=](Value *V) -> bool { + auto *I = cast<Instruction>(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3339,7 +3893,7 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3365,14 +3919,15 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end()); + SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; if (Bundle.empty()) break; } } + assert(LastInst && "Failed to find last instruction in bundle"); // Set the insertion point after the last instruction in the bundle. Set the // debug location to Front. @@ -3385,7 +3940,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) { // Generate the 'InsertElement' instruction. 
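
// The fallback scan in setInsertPointAfterBundle() above, as a toy: walk the
// block from the first bundle member, remember the last member seen in
// program order, and stop once every member is accounted for; the vector
// instruction must be inserted after that point. Plain chars model
// instructions here, and the isOpcodeOrAlt() refinement is omitted.
#include <cstdio>
#include <set>
#include <vector>

int lastBundleMember(const std::vector<char> &Block, std::set<char> Bundle) {
  int Last = -1;
  for (unsigned I = 0; I < Block.size(); ++I) {
    if (Bundle.erase(Block[I]))
      Last = (int)I; // most recent bundle member so far
    if (Bundle.empty())
      break;         // nothing left to find: stop scanning early
  }
  return Last;
}

int main() {
  // block: a b x c y, bundle {a, c} -> last member is 'c' at position 3
  std::printf("%d\n", lastBundleMember({'a', 'b', 'x', 'c', 'y'}, {'a', 'c'}));
  return 0;
}
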
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); - if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) { + if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); @@ -3494,8 +4049,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast<Instruction>(S.OpValue); + Instruction *VL0 = E->getMainOp(); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast<StoreInst>(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3503,8 +4057,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + if (E->State == TreeEntry::NeedToGather) { + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3518,11 +4072,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = + E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { - PHINode *PH = dyn_cast<PHINode>(VL0); + auto *PH = cast<PHINode>(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); @@ -3558,7 +4112,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::ExtractElement: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { Value *V = E->getSingleOperand(0); if (!E->ReorderIndices.empty()) { OrdersType Mask; @@ -3577,7 +4131,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3591,7 +4145,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { + if (E->State == TreeEntry::Vectorize) { LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -3612,7 +4166,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3637,7 +4191,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *InVec = vectorizeTree(E->getOperand(0)); @@ -3646,7 +4200,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return E->VectorizedValue; } - CastInst *CI = dyn_cast<CastInst>(VL0); + auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3658,7 +4212,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::FCmp: case 
Instruction::ICmp: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *L = vectorizeTree(E->getOperand(0)); Value *R = vectorizeTree(E->getOperand(1)); @@ -3670,7 +4224,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3685,7 +4239,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Select: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Cond = vectorizeTree(E->getOperand(0)); Value *True = vectorizeTree(E->getOperand(1)); @@ -3706,7 +4260,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::FNeg: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op = vectorizeTree(E->getOperand(0)); @@ -3716,7 +4270,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateUnOp( - static_cast<Instruction::UnaryOps>(S.getOpcode()), Op); + static_cast<Instruction::UnaryOps>(E->getOpcode()), Op); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3748,7 +4302,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *LHS = vectorizeTree(E->getOperand(0)); Value *RHS = vectorizeTree(E->getOperand(1)); @@ -3759,7 +4313,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, + RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); @@ -3776,12 +4331,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - bool IsReorder = !E->ReorderIndices.empty(); - if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); - VL0 = cast<Instruction>(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + bool IsReorder = E->updateStateIfReorder(); + if (IsReorder) + VL0 = E->getMainOp(); + setInsertPointAfterBundle(E); LoadInst *LI = cast<LoadInst>(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3797,11 +4350,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0)); - unsigned Alignment = LI->getAlignment(); + MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) { - Alignment = DL->getABITypeAlignment(ScalarLoadTy); - } + if (!Alignment) + Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { @@ -3820,15 +4372,25 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(VL0); + bool IsReorder = !E->ReorderIndices.empty(); + auto *SI = cast<StoreInst>( + IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); + if (IsReorder) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices, + "reorder_shuffle"); + } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast( + ScalarPtr, VecValue->getType()->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -3840,7 +4402,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3851,14 +4413,29 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); Value *Op0 = vectorizeTree(E->getOperand(0)); std::vector<Value *> OpVecs; for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e; ++j) { - Value *OpVec = vectorizeTree(E->getOperand(j)); + ValueList &VL = E->getOperand(j); + // Need to cast all elements to the same type before vectorization to + // avoid crash. + Type *VL0Ty = VL0->getOperand(j)->getType(); + Type *Ty = llvm::all_of( + VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); }) + ? 
VL0Ty + : DL->getIndexType(cast<GetElementPtrInst>(VL0) + ->getPointerOperandType() + ->getScalarType()); + for (Value *&V : VL) { + auto *CI = cast<ConstantInt>(V); + V = ConstantExpr::getIntegerCast(CI, Ty, + CI->getValue().isSignBitSet()); + } + Value *OpVec = vectorizeTree(VL); OpVecs.push_back(OpVec); } @@ -3878,13 +4455,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); - setInsertPointAfterBundle(E->Scalars, S); - Function *FI; + setInsertPointAfterBundle(E); + Intrinsic::ID IID = Intrinsic::not_intrinsic; - Value *ScalarArg = nullptr; - if (CI && (FI = CI->getCalledFunction())) { + if (Function *FI = CI->getCalledFunction()) IID = FI->getIntrinsicID(); - } + + Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; @@ -3926,20 +4503,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { return V; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->isAltShuffle() && + ((Instruction::isBinaryOp(E->getOpcode()) && + Instruction::isBinaryOp(E->getAltOpcode())) || + (Instruction::isCast(E->getOpcode()) && + Instruction::isCast(E->getAltOpcode()))) && "Invalid Shuffle Vector Operand"); - Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - setInsertPointAfterBundle(E->Scalars, S); + Value *LHS = nullptr, *RHS = nullptr; + if (Instruction::isBinaryOp(E->getOpcode())) { + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); RHS = vectorizeTree(E->getOperand(1)); } else { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E); LHS = vectorizeTree(E->getOperand(0)); } @@ -3949,16 +4526,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->getOpcode())) { V0 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast<Instruction::BinaryOps>(S.getAltOpcode()), LHS, RHS); + static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast<Instruction::CastOps>(S.getAltOpcode()), LHS, VecTy); + static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. 
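
// What the comment above sets up, emulated standalone: both full vectors are
// built (V0 with the main opcode, V1 with the alternate one), and a
// shufflevector-style mask picks lane i either from V0 (index i) or from V1
// (index e + i), which is the Mask[i] = e + i case in the hunk that follows.
// Toy add/sub with alternate ops on odd lanes is an assumption.
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  const unsigned e = 4;
  std::vector<int> V0(e), V1(e), Out(e);
  std::vector<unsigned> Mask(e);
  for (unsigned i = 0; i < e; ++i) {
    V0[i] = A[i] + B[i];           // main opcode: add
    V1[i] = A[i] - B[i];           // alternate opcode: sub
    Mask[i] = (i % 2) ? e + i : i; // odd lanes take the alternate op
  }
  for (unsigned i = 0; i < e; ++i)
    Out[i] = Mask[i] < e ? V0[Mask[i]] : V1[Mask[i] - e];
  for (int X : Out)
    std::printf("%d ", X); // 11 -18 33 -36
  std::printf("\n");
  return 0;
}
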
@@ -3969,8 +4546,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { SmallVector<Constant *, 8> Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast<Instruction>(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + if (OpInst->getOpcode() == E->getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -4054,7 +4631,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { continue; TreeEntry *E = getTreeEntry(Scalar); assert(E && "Invalid scalar"); - assert(!E->NeedToGather && "Extracting from a gather list"); + assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; assert(Vec && "Can't find vectorizable value"); @@ -4127,7 +4704,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (Entry->State == TreeEntry::NeedToGather) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -4136,20 +4713,18 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; +#ifndef NDEBUG Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { -#ifndef NDEBUG for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - // It is legal to replace users in the ignorelist by undef. + // It is legal to delete users in the ignorelist. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && - "Replacing out-of-tree value with undef"); + "Deleting out-of-tree value"); } -#endif - Value *Undef = UndefValue::get(Ty); - Scalar->replaceAllUsesWith(Undef); } +#endif LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); eraseInstruction(cast<Instruction>(Scalar)); } @@ -4165,7 +4740,7 @@ void BoUpSLP::optimizeGatherSequence() { << " gather sequences instructions.\n"); // LICM InsertElementInst sequences. for (Instruction *I : GatherSeq) { - if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I)) + if (isDeleted(I)) continue; // Check if this block is inside a loop. @@ -4219,6 +4794,8 @@ void BoUpSLP::optimizeGatherSequence() { // For all instructions in blocks containing gather sequences: for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) { Instruction *In = &*it++; + if (isDeleted(In)) + continue; if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In)) continue; @@ -4245,11 +4822,11 @@ void BoUpSLP::optimizeGatherSequence() { // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, - BoUpSLP *SLP, - const InstructionsState &S) { +Optional<BoUpSLP::ScheduleData *> +BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, + const InstructionsState &S) { if (isa<PHINode>(S.OpValue)) - return true; + return nullptr; // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; @@ -4262,7 +4839,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, // instructions of the bundle. 
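
// A sketch of the region growing done by the loop that follows: the block
// scheduler tracks one contiguous window of instructions, and each bundle
// member must fall inside it, extending the window on demand up to a size
// limit (loosely mirroring the ScheduleRegionSizeLimit bail-out). Integer
// positions model instructions; the details are assumptions of this toy.
#include <algorithm>
#include <cstdio>

struct Region {
  int Start = -1, End = -1; // half-open window, empty when negative
  bool extend(int Pos, int Limit) {
    if (Start < 0) { Start = Pos; End = Pos + 1; return true; }
    if (End - Start > Limit)
      return false;         // region too large: give up on this bundle
    Start = std::min(Start, Pos);
    End = std::max(End, Pos + 1);
    return true;
  }
};

int main() {
  Region R;
  R.extend(10, 100);
  R.extend(4, 100);         // window grows downwards to cover position 4
  std::printf("[%d,%d)\n", R.Start, R.End); // [4,11)
  return 0;
}
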
for (Value *V : VL) { if (!extendSchedulingRegion(V, S)) - return false; + return None; } for (Value *V : VL) { @@ -4308,6 +4885,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, resetSchedule(); initialFillReadyList(ReadyInsts); } + assert(Bundle && "Failed to find schedule bundle"); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " << BB->getName() << "\n"); @@ -4329,9 +4907,9 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); - return false; + return None; } - return true; + return Bundle; } void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, @@ -4364,7 +4942,7 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize)); + ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); @@ -4977,7 +5555,7 @@ struct SLPVectorizer : public FunctionPass { auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); - auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); @@ -5052,7 +5630,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // If the target claims to have no vector registers don't attempt // vectorization. - if (!TTI->getNumberOfRegisters(true)) + if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) return false; // Don't vectorize when the attribute NoImplicitFloat is used. @@ -5100,139 +5678,141 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } -/// Check that the Values in the slice in VL array are still existent in -/// the WeakTrackingVH array. -/// Vectorization of part of the VL array may cause later values in the VL array -/// to become invalid. We track when this has happened in the WeakTrackingVH -/// array. -static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, - ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin, - unsigned SliceSize) { - VL = VL.slice(SliceBegin, SliceSize); - VH = VH.slice(SliceBegin, SliceSize); - return !std::equal(VL.begin(), VL.end(), VH.begin()); -} - bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, - unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); - LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen + unsigned Idx) { + LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); - const unsigned VF = VecRegSize / Sz; + const unsigned MinVF = R.getMinVecRegSize() / Sz; + unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || VF < 2) + if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) return false; - // Keep track of values that were deleted by vectorizing in the loop below. 
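// vectorizeStoreChain above now vectorizes the whole chain at once: it asks
// the tree for its best element order and, when one covers every element,
// rebuilds the tree over the permuted chain. The permutation step on its
// own, as a standalone sketch (the Order values here are hypothetical; in
// the diff they come from R.bestOrder()):
#include <cstdio>
#include <vector>

int main() {
  std::vector<const char *> Chain = {"s0", "s1", "s2", "s3"};
  std::vector<unsigned> Order = {2, 0, 3, 1}; // hypothetical best order
  std::vector<const char *> Reordered(Chain.size());
  for (size_t I = 0; I < Order.size(); ++I)
    Reordered[I] = Chain[Order[I]]; // llvm::transform(*Order, ...) in the diff
  for (const char *S : Reordered)
    std::printf("%s ", S); // s2 s0 s3 s1
  return 0;
}
// The removed WeakTrackingVH bookkeeping, made unnecessary by BoUpSLP's new
// deletion tracking, follows: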
- const SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end()); - - bool Changed = false; - // Look for profitable vectorizable trees at all offsets, starting at zero. - for (unsigned i = 0, e = ChainLen; i + VF <= e; ++i) { - - // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(Chain, TrackValues, i, VF)) - continue; - - LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i - << "\n"); - ArrayRef<Value *> Operands = Chain.slice(i, VF); - - R.buildTree(Operands); - if (R.isTreeTinyAndNotFullyVectorizable()) - continue; + LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx + << "\n"); - R.computeMinimumValueSizes(); + R.buildTree(Chain); + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + // TODO: Handle orders of size less than number of elements in the vector. + if (Order && Order->size() == Chain.size()) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); + llvm::transform(*Order, ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); + R.buildTree(ReorderedOps); + } + if (R.isTreeTinyAndNotFullyVectorizable()) + return false; - int Cost = R.getTreeCost(); + R.computeMinimumValueSizes(); - LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF - << "\n"); - if (Cost < -SLPCostThreshold) { - LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + int Cost = R.getTreeCost(); - using namespace ore; + LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); + if (Cost < -SLPCostThreshold) { + LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); - R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", - cast<StoreInst>(Chain[i])) - << "Stores SLP vectorized with cost " << NV("Cost", Cost) - << " and with tree size " - << NV("TreeSize", R.getTreeSize())); + using namespace ore; - R.vectorizeTree(); + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast<StoreInst>(Chain[0])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); - // Move to the next bundle. - i += VF - 1; - Changed = true; - } + R.vectorizeTree(); + return true; } - return Changed; + return false; } bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, BoUpSLP &R) { - SetVector<StoreInst *> Heads; - SmallDenseSet<StoreInst *> Tails; - SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain; - // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. 
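// The rewritten search below drops the Heads/Tails pointer sets and the
// StoreInst*-to-StoreInst* map in favor of two index-based structures: a bit
// vector marking stores that already follow another store, and
// ConsecutiveChain[K] holding the index of the store that follows store K,
// with E + 1 as the "no successor" sentinel. Following a chain then becomes
// plain index hopping. A standalone sketch with the same conventions:
#include <cstdio>
#include <vector>

int main() {
  const int E = 5; // number of stores
  std::vector<int> ConsecutiveChain(E, E + 1);
  std::vector<bool> Tails(E, false);
  // Suppose stores 0 -> 2 -> 4 were found to be consecutive.
  ConsecutiveChain[0] = 2; Tails[2] = true;
  ConsecutiveChain[2] = 4; Tails[4] = true;
  // Chains start at stores that are not tails; walk until the sentinel.
  for (int I = 0; I < E; ++I) {
    if (Tails[I] || ConsecutiveChain[I] == E + 1)
      continue;
    std::printf("chain:");
    for (int K = I; K != E + 1; K = ConsecutiveChain[K])
      std::printf(" %d", K);
    std::printf("\n"); // chain: 0 2 4
  }
  return 0;
}
// VectorizedStores, declared next, additionally keeps a chain from
// revisiting stores an earlier chain already vectorized: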
BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - auto &&FindConsecutiveAccess = - [this, &Stores, &Heads, &Tails, &ConsecutiveChain] (int K, int Idx) { - if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) - return false; - - Tails.insert(Stores[Idx]); - Heads.insert(Stores[K]); - ConsecutiveChain[Stores[K]] = Stores[Idx]; - return true; - }; + int E = Stores.size(); + SmallBitVector Tails(E, false); + SmallVector<int, 16> ConsecutiveChain(E, E + 1); + int MaxIter = MaxStoreLookup.getValue(); + int IterCnt; + auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &ConsecutiveChain](int K, int Idx) { + if (IterCnt >= MaxIter) + return true; + ++IterCnt; + if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) + return false; + Tails.set(Idx); + ConsecutiveChain[K] = Idx; + return true; + }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. - int E = Stores.size(); for (int Idx = E - 1; Idx >= 0; --Idx) { // If a store has multiple consecutive store candidates, search according // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ... // This is because usually pairing with immediate succeeding or preceding // candidate create the best chance to find slp vectorization opportunity. - for (int Offset = 1, F = std::max(E - Idx, Idx + 1); Offset < F; ++Offset) + const int MaxLookDepth = std::max(E - Idx, Idx + 1); + IterCnt = 0; + for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset) if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) || (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx))) break; } // For stores that start but don't end a link in the chain: - for (auto *SI : llvm::reverse(Heads)) { - if (Tails.count(SI)) + for (int Cnt = E; Cnt > 0; --Cnt) { + int I = Cnt - 1; + if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) continue; - // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; - StoreInst *I = SI; // Collect the chain into a list. - while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) { - Operands.push_back(I); + while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + Operands.push_back(Stores[I]); // Move to the next value in the chain. I = ConsecutiveChain[I]; } + // If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Stores[0]); + if (MaxVecRegSize % EltSize != 0) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? - for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); - Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Operands.begin(), Operands.end()); - Changed = true; - break; + unsigned StartIdx = 0; + for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { + ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); + if (!VectorizedStores.count(Slice.front()) && + !VectorizedStores.count(Slice.back()) && + vectorizeStoreChain(Slice, R, Cnt)) { + // Mark the vectorized stores so that we don't vectorize them again. 
+ VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += Size; + Cnt += Size; + continue; + } + ++Cnt; } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Operands.size()) + break; } } @@ -5329,12 +5909,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool CandidateFound = false; int MinCost = SLPCostThreshold; - // Keep track of values that were deleted by vectorizing in the loop below. - SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end()); - unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5352,13 +5928,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) break; + ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); // Check that a previous iteration of this loop did not delete the Value. - if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth)) + if (llvm::any_of(Ops, [&R](Value *V) { + auto *I = dyn_cast<Instruction>(V); + return I && R.isDeleted(I); + })) continue; LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); - ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); R.buildTree(Ops); Optional<ArrayRef<unsigned>> Order = R.bestOrder(); @@ -5571,7 +6150,7 @@ class HorizontalReduction { Value *createOp(IRBuilder<> &Builder, const Twine &Name) const { assert(isVectorizable() && "Expected add|fadd or min/max reduction operation."); - Value *Cmp; + Value *Cmp = nullptr; switch (Kind) { case RK_Arithmetic: return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS, @@ -5579,23 +6158,23 @@ class HorizontalReduction { case RK_Min: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS) : Builder.CreateFCmpOLT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_Max: Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS) : Builder.CreateFCmpOGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMin: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpULT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_UMax: assert(Opcode == Instruction::ICmp && "Expected integer types."); Cmp = Builder.CreateICmpUGT(LHS, RHS); - break; + return Builder.CreateSelect(Cmp, LHS, RHS, Name); case RK_None: - llvm_unreachable("Unknown reduction operation."); + break; } - return Builder.CreateSelect(Cmp, LHS, RHS, Name); + llvm_unreachable("Unknown reduction operation."); } public: @@ -5618,38 +6197,36 @@ class HorizontalReduction { explicit operator bool() const { return Opcode; } - /// Get the index of the first operand. - unsigned getFirstOperandIndex() const { - assert(!!*this && "The opcode is not set."); + /// Return true if this operation is any kind of minimum or maximum. 
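// In createOp above, every min/max kind now lowers to the same two-step
// pattern: emit the comparison for that kind, then select between the
// original operands. What the generated IR computes, modeled for signed max
// on plain integers (a standalone sketch, not IRBuilder output):
#include <cassert>

int smax(int LHS, int RHS) {
  bool Cmp = LHS > RHS;   // Builder.CreateICmpSGT(LHS, RHS)
  return Cmp ? LHS : RHS; // Builder.CreateSelect(Cmp, LHS, RHS, Name)
}

int main() {
  assert(smax(3, 7) == 7 && smax(-1, -5) == -1);
  return 0;
}
// isMinMax(), documented just above, centralizes the classification that the
// remaining helpers used to re-derive with their own switch statements: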
+ bool isMinMax() const { switch (Kind) { + case RK_Arithmetic: + return false; case RK_Min: - case RK_UMin: case RK_Max: + case RK_UMin: case RK_UMax: - return 1; - case RK_Arithmetic: + return true; case RK_None: break; } - return 0; + llvm_unreachable("Reduction kind is not set"); + } + + /// Get the index of the first operand. + unsigned getFirstOperandIndex() const { + assert(!!*this && "The opcode is not set."); + // We allow calling this before 'Kind' is set, so handle that specially. + if (Kind == RK_None) + return 0; + return isMinMax() ? 1 : 0; } /// Total number of operands in the reduction operation. unsigned getNumberOfOperands() const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return 2; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: - return 3; - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? 3 : 2; } /// Checks if the operation has the same parent as \p P. @@ -5658,79 +6235,46 @@ class HorizontalReduction { "Expected reduction operation."); if (!IsRedOp) return I->getParent() == P; - switch (Kind) { - case RK_Arithmetic: - // Arithmetic reduction operation must be used once only. - return I->getParent() == P; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: { + if (isMinMax()) { // SelectInst must be used twice while the condition op must have single // use only. auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition()); return I->getParent() == P && Cmp && Cmp->getParent() == P; } - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + // Arithmetic reduction operation must be used once only. + return I->getParent() == P; } + /// Expected number of uses for reduction operations/reduced values. bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - return I->hasOneUse(); - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) return I->hasNUses(2) && (!IsReductionOp || cast<SelectInst>(I)->getCondition()->hasOneUse()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return I->hasOneUse(); } /// Initializes the list of reduction operations. void initReductionOps(ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps.assign(1, ReductionOpsType()); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) ReductionOps.assign(2, ReductionOpsType()); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); - } + else + ReductionOps.assign(1, ReductionOpsType()); } + /// Add all reduction operations for the reduction instruction \p I. 
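// Every helper above keys off one structural fact: a min/max reduction step
// is a select whose operands are (compare, LHS, RHS), so it has three
// operands and its first data operand sits at index 1, while an arithmetic
// step has the usual two. Restated as a standalone sketch:
#include <cassert>

enum Kind { Arithmetic, MinMax };

unsigned numberOfOperands(Kind K) { return K == MinMax ? 3 : 2; }
unsigned firstOperandIndex(Kind K) { return K == MinMax ? 1 : 0; }

int main() {
  assert(numberOfOperands(MinMax) == 3 && firstOperandIndex(MinMax) == 1);
  assert(numberOfOperands(Arithmetic) == 2 &&
         firstOperandIndex(Arithmetic) == 0);
  return 0;
}
// addReductionOps, documented just above, likewise records two operation
// lists for min/max (the compare and the select) but only one for
// arithmetic: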
void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) { assert(Kind != RK_None && !!*this && LHS && RHS && "Expected reduction operation."); - switch (Kind) { - case RK_Arithmetic: - ReductionOps[0].emplace_back(I); - break; - case RK_Min: - case RK_UMin: - case RK_Max: - case RK_UMax: + if (isMinMax()) { ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition()); ReductionOps[1].emplace_back(I); - break; - case RK_None: - llvm_unreachable("Reduction kind is not set"); + } else { + ReductionOps[0].emplace_back(I); } } @@ -5763,12 +6307,12 @@ class HorizontalReduction { /// Checks if two operation data are both a reduction op or both a reduced /// value. - bool operator==(const OperationData &OD) { + bool operator==(const OperationData &OD) const { assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) && "One of the comparing operations is incorrect."); return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode); } - bool operator!=(const OperationData &OD) { return !(*this == OD); } + bool operator!=(const OperationData &OD) const { return !(*this == OD); } void clear() { Opcode = 0; LHS = nullptr; @@ -5788,18 +6332,7 @@ class HorizontalReduction { Value *getLHS() const { return LHS; } Value *getRHS() const { return RHS; } Type *getConditionType() const { - switch (Kind) { - case RK_Arithmetic: - return nullptr; - case RK_Min: - case RK_Max: - case RK_UMin: - case RK_UMax: - return CmpInst::makeCmpResultType(LHS->getType()); - case RK_None: - break; - } - llvm_unreachable("Reduction kind is not set"); + return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr; } /// Creates reduction operation with the current opcode with the IR flags @@ -6183,6 +6716,18 @@ public: assert(Pair.first && "DebugLoc must be set."); ExternallyUsedValues[Pair.second].push_back(Pair.first); } + + // The compare instruction of a min/max is the insertion point for new + // instructions and may be replaced with a new compare instruction. + auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) { + assert(isa<SelectInst>(RdxRootInst) && + "Expected min/max reduction to have select root instruction"); + Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition(); + assert(isa<Instruction>(ScalarCond) && + "Expected min/max reduction to have compare condition"); + return cast<Instruction>(ScalarCond); + }; + // The reduction root is used as the insertion point for new instructions, // so set it as externally used to prevent it from being deleted. ExternallyUsedValues[ReductionRoot]; @@ -6203,6 +6748,8 @@ public: } if (V.isTreeTinyAndNotFullyVectorizable()) break; + if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode())) + break; V.computeMinimumValueSizes(); @@ -6236,8 +6783,14 @@ public: DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc(); Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues); - // Emit a reduction. - Builder.SetInsertPoint(cast<Instruction>(ReductionRoot)); + // Emit a reduction. For min/max, the root is a select, but the insertion + // point is the compare condition of that select. 
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot); + if (ReductionData.isMinMax()) + Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst)); + else + Builder.SetInsertPoint(RdxRootInst); + Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI); if (VectorizedTree) { @@ -6273,8 +6826,23 @@ public: VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I); } } - // Update users. + + // Update users. For a min/max reduction that ends with a compare and + // select, we also have to RAUW for the compare instruction feeding the + // reduction root. That's because the original compare may have extra uses + // besides the final select of the reduction. + if (ReductionData.isMinMax()) { + if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) { + Instruction *ScalarCmp = + getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot)); + ScalarCmp->replaceAllUsesWith(VecSelect->getCondition()); + } + } ReductionRoot->replaceAllUsesWith(VectorizedTree); + + // Mark all scalar reduction ops for deletion, they are replaced by the + // vector reductions. + V.eraseInstructions(IgnoreList); } return VectorizedTree != nullptr; } @@ -6323,7 +6891,7 @@ private: IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost; int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost; - int ScalarReduxCost; + int ScalarReduxCost = 0; switch (ReductionData.getKind()) { case RK_Arithmetic: ScalarReduxCost = @@ -6397,46 +6965,54 @@ private: /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl<Value *> &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast<ConstantInt>(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa<UndefValue>(V)) - break; - LastInsertElem = dyn_cast<InsertElementInst>(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. 
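// Both the old findBuildAggregate below and its replacement walk such an
// insertion chain backwards from the last insert toward the undef base,
// collecting each inserted scalar and reversing at the end so the operands
// come out in lane order. The core walk as a standalone sketch (toy node
// type; the real code also enforces hasOneUse and recurses into nested
// aggregates):
#include <algorithm>
#include <cstdio>
#include <vector>

struct Insert {
  Insert *Base; // previous insert in the chain, or nullptr for the undef base
  int Scalar;   // the scalar this step inserts
};

std::vector<int> collectBuildVector(Insert *Last) {
  std::vector<int> Ops;
  for (Insert *I = Last; I; I = I->Base)
    Ops.push_back(I->Scalar);           // collected last-to-first
  std::reverse(Ops.begin(), Ops.end()); // restore lane order
  return Ops;
}

int main() {
  Insert I0{nullptr, 10}, I1{&I0, 20}, I2{&I1, 30};
  for (int V : collectBuildVector(&I2))
    std::printf("%d ", V); // 10 20 30
  return 0;
}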
-static bool findBuildAggregate(InsertValueInst *IV,
-                               SmallVectorImpl<Value *> &BuildVectorOpds) {
-  Value *V;
+static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+                               SmallVectorImpl<Value *> &BuildVectorOpds,
+                               int &UserCost) {
+  assert((isa<InsertElementInst>(LastInsertInst) ||
+          isa<InsertValueInst>(LastInsertInst)) &&
+         "Expected insertelement or insertvalue instruction!");
+  UserCost = 0;
   do {
-    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
-    V = IV->getAggregateOperand();
-    if (isa<UndefValue>(V))
+    Value *InsertedOperand;
+    if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+      InsertedOperand = IE->getOperand(1);
+      LastInsertInst = IE->getOperand(0);
+      if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+        UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
+                                            IE->getType(), CI->getZExtValue());
+      }
+    } else {
+      auto *IV = cast<InsertValueInst>(LastInsertInst);
+      InsertedOperand = IV->getInsertedValueOperand();
+      LastInsertInst = IV->getAggregateOperand();
+    }
+    if (isa<InsertElementInst>(InsertedOperand) ||
+        isa<InsertValueInst>(InsertedOperand)) {
+      int TmpUserCost;
+      SmallVector<Value *, 8> TmpBuildVectorOpds;
+      if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
+                              TmpUserCost))
+        return false;
+      BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
+                             TmpBuildVectorOpds.rend());
+      UserCost += TmpUserCost;
+    } else {
+      BuildVectorOpds.push_back(InsertedOperand);
+    }
+    if (isa<UndefValue>(LastInsertInst))
       break;
-    IV = dyn_cast<InsertValueInst>(V);
-    if (!IV || !IV->hasOneUse())
+    if ((!isa<InsertValueInst>(LastInsertInst) &&
+         !isa<InsertElementInst>(LastInsertInst)) ||
+        !LastInsertInst->hasOneUse())
       return false;
   } while (true);
   std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
@@ -6530,18 +7106,13 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
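// The traversal below keeps an explicit worklist of (instruction, depth)
// pairs and refuses to descend past RecursionMaxDepth, trading recursion for
// a stack it can filter. The same shape as a standalone sketch over a toy
// operand tree (the depth limit here is made up):
#include <cstdio>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
  int Id;
};

void visitUpToDepth(Node *Root, unsigned MaxDepth) {
  std::vector<std::pair<Node *, unsigned>> Stack{{Root, 0}};
  while (!Stack.empty()) {
    auto [N, Level] = Stack.back();
    Stack.pop_back();
    std::printf("visit %d at depth %u\n", N->Id, Level);
    if (Level >= MaxDepth)
      continue; // analyzed, but don't push its operands
    for (Node *Op : N->Operands)
      Stack.push_back({Op, Level + 1});
  }
}

int main() {
  Node C{{}, 2}, B{{&C}, 1}, A{{&B}, 0};
  visitUpToDepth(&A, 1); // visits 0 and 1, never reaches 2
  return 0;
}
// In the diff below, the stack entries become plain Instruction pointers,
// since deleted instructions can now be filtered with R.isDeleted():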
- SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0}); + SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0}); SmallPtrSet<Value *, 8> VisitedInstrs; bool Res = false; while (!Stack.empty()) { - Value *V; + Instruction *Inst; unsigned Level; - std::tie(V, Level) = Stack.pop_back_val(); - if (!V) - continue; - auto *Inst = dyn_cast<Instruction>(V); - if (!Inst) - continue; + std::tie(Inst, Level) = Stack.pop_back_val(); auto *BI = dyn_cast<BinaryOperator>(Inst); auto *SI = dyn_cast<SelectInst>(Inst); if (BI || SI) { @@ -6582,8 +7153,8 @@ static bool tryToVectorizeHorReductionOrInstOperands( for (auto *Op : Inst->operand_values()) if (VisitedInstrs.insert(Op).second) if (auto *I = dyn_cast<Instruction>(Op)) - if (!isa<PHINode>(I) && I->getParent() == BB) - Stack.emplace_back(Op, Level); + if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); } return Res; } @@ -6609,25 +7180,26 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, BoUpSLP &R) { + int UserCost = 0; const DataLayout &DL = BB->getModule()->getDataLayout(); if (!R.canMapToVector(IVI->getType(), DL)) return false; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildAggregate(IVI, BuildVectorOpds)) + if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost)) return false; LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); // Aggregate value is unlikely to be processed in vector register, we need to // extract scalars into scalar registers, so NeedExtraction is set true. - return tryToVectorizeList(BuildVectorOpds, R); + return tryToVectorizeList(BuildVectorOpds, R, UserCost); } bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector<Value *, 16> BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa<ExtractElementInst>(V); }) && isShuffle(BuildVectorOpds))) @@ -6652,11 +7224,10 @@ bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB, } bool SLPVectorizerPass::vectorizeSimpleInstructions( - SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) { + SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) { bool OpsChanged = false; - for (auto &VH : reverse(Instructions)) { - auto *I = dyn_cast_or_null<Instruction>(VH); - if (!I) + for (auto *I : reverse(Instructions)) { + if (R.isDeleted(I)) continue; if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); @@ -6685,7 +7256,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (!P) break; - if (!VisitedInstrs.count(P)) + if (!VisitedInstrs.count(P) && !R.isDeleted(P)) Incoming.push_back(P); } @@ -6729,9 +7300,12 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { VisitedInstrs.clear(); - SmallVector<WeakVH, 8> PostProcessInstructions; + SmallVector<Instruction *, 8> PostProcessInstructions; SmallDenseSet<Instruction *, 4> KeyNodes; for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip instructions marked for the deletion. + if (R.isDeleted(&*it)) + continue; // We may go through BB multiple times so skip the one we have checked. 
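// Throughout this diff, WeakVH/WeakTrackingVH handles (which are nulled
// behind the scenes when their value dies) give way to plain Instruction
// pointers plus an explicit isDeleted() query: the vectorizer defers actual
// erasure and records doomed instructions in a set, so a stale pointer stays
// safe to test, just not to use. A standalone sketch of that scheme:
#include <cassert>
#include <unordered_set>

struct Instr { int Id; };

struct Deleter {
  std::unordered_set<const Instr *> Deleted;
  void eraseInstruction(const Instr *I) { Deleted.insert(I); } // deferred
  bool isDeleted(const Instr *I) const { return Deleted.count(I) != 0; }
};

int main() {
  Instr A{0}, B{1};
  Deleter D;
  D.eraseInstruction(&A);
  assert(D.isDeleted(&A) && !D.isDeleted(&B));
  return 0;
}
// The VisitedInstrs check below performs the skip the comment above
// describes: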
if (!VisitedInstrs.insert(&*it).second) { if (it->use_empty() && KeyNodes.count(&*it) > 0 && @@ -6811,10 +7385,16 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " << Entry.second.size() << ".\n"); - // We process the getelementptr list in chunks of 16 (like we do for - // stores) to minimize compile-time. - for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) { - auto Len = std::min<unsigned>(BE - BI, 16); + // Process the GEP list in chunks suitable for the target's supported + // vector size. If a vector register can't hold 1 element, we are done. + unsigned MaxVecRegSize = R.getMaxVecRegSize(); + unsigned EltSize = R.getVectorElementSize(Entry.second[0]); + if (MaxVecRegSize < EltSize) + continue; + + unsigned MaxElts = MaxVecRegSize / EltSize; + for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { + auto Len = std::min<unsigned>(BE - BI, MaxElts); auto GEPList = makeArrayRef(&Entry.second[BI], Len); // Initialize a set a candidate getelementptrs. Note that we use a @@ -6824,10 +7404,10 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { SetVector<Value *> Candidates(GEPList.begin(), GEPList.end()); // Some of the candidates may have already been vectorized after we - // initially collected them. If so, the WeakTrackingVHs will have - // nullified the - // values, so remove them from the set of candidates. - Candidates.remove(nullptr); + // initially collected them. If so, they are marked as deleted, so remove + // them from the set of candidates. + Candidates.remove_if( + [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); }); // Remove from the set of candidates all pairs of getelementptrs with // constant differences. Such getelementptrs are likely not good @@ -6835,18 +7415,18 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { // computed from the other. We also ensure all candidate getelementptr // indices are unique. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { - auto *GEPI = cast<GetElementPtrInst>(GEPList[I]); + auto *GEPI = GEPList[I]; if (!Candidates.count(GEPI)) continue; auto *SCEVI = SE->getSCEV(GEPList[I]); for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { - auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]); + auto *GEPJ = GEPList[J]; auto *SCEVJ = SE->getSCEV(GEPList[J]); if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) { - Candidates.remove(GEPList[I]); - Candidates.remove(GEPList[J]); + Candidates.remove(GEPI); + Candidates.remove(GEPJ); } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { - Candidates.remove(GEPList[J]); + Candidates.remove(GEPJ); } } } @@ -6894,14 +7474,7 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << it->second.size() << ".\n"); - // Process the stores in chunks of 16. - // TODO: The limit of 16 inhibits greater vectorization factors. - // For example, AVX2 supports v32i8. Increasing this limit, however, - // may cause a significant compile-time increase. 
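// Here, as in the GEP hunk above, a hard-coded chunk size of 16 is replaced
// by one derived from the target: MaxElts = MaxVecRegSize / EltSize, so a
// 256-bit register with 8-bit elements now yields 32-element chunks instead
// of capping at 16. The slicing arithmetic as a standalone sketch:
#include <algorithm>
#include <cstdio>

void processInChunks(unsigned NumStores, unsigned MaxVecRegSizeBits,
                     unsigned EltSizeBits) {
  if (MaxVecRegSizeBits < EltSizeBits)
    return; // a vector register can't hold even one element
  unsigned MaxElts = MaxVecRegSizeBits / EltSizeBits;
  for (unsigned BI = 0; BI < NumStores; BI += MaxElts) {
    unsigned Len = std::min(NumStores - BI, MaxElts);
    std::printf("chunk [%u, %u)\n", BI, BI + Len);
  }
}

int main() {
  processInChunks(70, 256, 8); // chunk [0, 32)  [32, 64)  [64, 70)
  return 0;
}
// The fixed-size loop being removed follows: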
- for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) { - unsigned Len = std::min<unsigned>(CE - CI, 16); - Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R); - } + Changed |= vectorizeStores(it->second, R); } return Changed; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 0ca6a6b93cfd..598fb00e956e 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -47,6 +47,24 @@ class VPRecipeBuilder { EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to + // their recipe. To save on memory, only do so for selected ingredients, + // marked by having a nullptr entry in this map. If those ingredients get a + // VPWidenRecipe, also avoid compressing other ingredients into it to avoid + // having to split such recipes later. + DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe; + VPWidenRecipe *LastExtensibleRecipe = nullptr; + + /// Set the recipe created for given ingredient. This operation is a no-op for + /// ingredients that were not marked using a nullptr entry in the map. + void setRecipe(Instruction *I, VPRecipeBase *R) { + if (!Ingredient2Recipe.count(I)) + return; + assert(Ingredient2Recipe[I] == nullptr && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = R; + } + public: /// A helper function that computes the predicate of the block BB, assuming /// that the header block of the loop is set to True. It returns the *entry* @@ -57,16 +75,22 @@ public: /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); - /// Check if \I belongs to an Interleave Group within the given VF \p Range, - /// \return true in the first returned value if so and false otherwise. - /// Build a new VPInterleaveGroup Recipe if \I is the primary member of an IG - /// for \p Range.Start, and provide it as the second returned value. - /// Note that if \I is an adjunct member of an IG for \p Range.Start, the - /// \return value is <true, nullptr>, as it is handled by another recipe. - /// \p Range.End may be decreased to ensure same decision from \p Range.Start - /// to \p Range.End. - VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for + /// it. + void recordRecipeOf(Instruction *I) { + assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) && + "Recipe already set for ingredient"); + Ingredient2Recipe[I] = nullptr; + } + + /// Return the recipe created for given ingredient. + VPRecipeBase *getRecipe(Instruction *I) { + assert(Ingredient2Recipe.count(I) && + "Recording this ingredients recipe was not requested"); + assert(Ingredient2Recipe[I] != nullptr && + "Ingredient doesn't have a recipe"); + return Ingredient2Recipe[I]; + } /// Check if \I is a memory instruction to be widened for \p Range.Start and /// potentially masked. 
Such instructions are handled by a recipe that takes diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp index 517d759d7bfc..f1c708720ccf 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -275,14 +276,37 @@ void VPRegionBlock::execute(VPTransformState *State) { } void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); Parent = InsertPos->getParent(); Parent->getRecipeList().insert(InsertPos->getIterator(), this); } +void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) { + assert(!Parent && "Recipe already in some VPBasicBlock"); + assert(InsertPos->getParent() && + "Insertion position not in any VPBasicBlock"); + Parent = InsertPos->getParent(); + Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this); +} + +void VPRecipeBase::removeFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); + getParent()->getRecipeList().remove(getIterator()); + Parent = nullptr; +} + iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() { + assert(getParent() && "Recipe not in any VPBasicBlock"); return getParent()->getRecipeList().erase(getIterator()); } +void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) { + removeFromParent(); + insertAfter(InsertPos); +} + void VPInstruction::generateInstruction(VPTransformState &State, unsigned Part) { IRBuilder<> &Builder = State.Builder; @@ -309,6 +333,14 @@ void VPInstruction::generateInstruction(VPTransformState &State, State.set(this, V, Part); break; } + case Instruction::Select: { + Value *Cond = State.get(getOperand(0), Part); + Value *Op1 = State.get(getOperand(1), Part); + Value *Op2 = State.get(getOperand(2), Part); + Value *V = Builder.CreateSelect(Cond, Op1, Op2); + State.set(this, V, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -433,14 +465,20 @@ void VPlan::execute(VPTransformState *State) { // We do not attempt to preserve DT for outer loop vectorization currently. if (!EnableVPlanNativePath) - updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); + updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB, + L->getExitBlock()); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB) { + BasicBlock *LoopLatchBB, + BasicBlock *LoopExitBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); assert(LoopHeaderBB && "Loop preheader does not have a single successor."); - DT->addNewBlock(LoopHeaderBB, LoopPreHeaderBB); // The vector body may be more than a single basic-block by this point. // Update the dominator tree information inside the vector body by propagating // it from header to latch, expecting only triangular control-flow, if any. 
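The new removeFromParent, insertAfter, and moveAfter helpers above give recipes splice semantics on the intrusive recipe list of their parent VPBasicBlock: moveAfter is exactly unlink-then-relink, never a copy, and the assertions guarantee a recipe is linked into at most one block at a time. A standalone analogue of that move, assuming std::list in place of LLVM's iplist:

#include <cstdio>
#include <list>

int main() {
  std::list<int> Block = {1, 2, 3, 4};
  auto From = Block.begin();              // the recipe to move (1)
  auto Pos = std::next(Block.begin(), 2); // the insertion point (3)
  // splice(before-iterator, source, element): relink 1 right after 3,
  // without copying or invalidating the moved element.
  Block.splice(std::next(Pos), Block, From);
  for (int V : Block)
    std::printf("%d ", V); // 2 3 1 4
  return 0;
}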
@@ -471,6 +509,9 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, DT->addNewBlock(InterimSucc, BB); DT->addNewBlock(PostDomSucc, BB); } + // Latch block is a new dominator for the loop exit. + DT->changeImmediateDominator(LoopExitBB, LoopLatchBB); + assert(DT->verify(DominatorTree::VerificationLevel::Fast)); } const Twine VPlanPrinter::getUID(const VPBlockBase *Block) { @@ -495,8 +536,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -508,7 +548,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; @@ -647,6 +687,16 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, O << " " << VPlanIngredient(IV) << "\\l\""; } +void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const { + O << " +\n" << Indent << "\"WIDEN-GEP "; + O << (IsPtrLoopInvariant ? "Inv" : "Var"); + size_t IndicesNumber = IsIndexLoopInvariant.size(); + for (size_t I = 0; I < IndicesNumber; ++I) + O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]"; + O << "\\l\""; + O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\""; +} + void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\""; } @@ -689,9 +739,12 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const { void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent) const { O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr); - if (User) { + O << ", "; + getAddr()->printAsOperand(O); + VPValue *Mask = getMask(); + if (Mask) { O << ", "; - User->getOperand(0)->printAsOperand(O); + Mask->printAsOperand(O); } O << "\\l\""; } @@ -728,7 +781,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, auto NewIGIter = Old2New.find(IG); if (NewIGIter == Old2New.end()) Old2New[IG] = new InterleaveGroup<VPInstruction>( - IG->getFactor(), IG->isReverse(), IG->getAlignment()); + IG->getFactor(), IG->isReverse(), Align(IG->getAlignment())); if (Inst == IG->getInsertPos()) Old2New[IG]->setInsertPos(VPInst); @@ -736,7 +789,8 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, InterleaveGroupMap[VPInst] = Old2New[IG]; InterleaveGroupMap[VPInst]->insertMember( VPInst, IG->getIndex(Inst), - IG->isReverse() ? (-1) * int(IG->getFactor()) : IG->getFactor()); + Align(IG->isReverse() ? 
(-1) * int(IG->getFactor()) + : IG->getFactor())); } } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block)) visitRegion(Region, Old2New, IAI); diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h index 8a06412ad590..c65abc3639d7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlan.h @@ -31,6 +31,7 @@ #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" @@ -226,6 +227,8 @@ public: struct VPCallback { virtual ~VPCallback() {} virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; + virtual Value *getOrCreateScalarValue(Value *V, + const VPIteration &Instance) = 0; }; /// VPTransformState holds information passed down when "executing" a VPlan, @@ -268,6 +271,13 @@ struct VPTransformState { return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); } + /// Get the generated Value for a given VPValue and given Part and Lane. Note + /// that as per-lane Defs are still created by ILV and managed in its ValueMap + /// this method currently just delegates the call to ILV. + Value *get(VPValue *Def, const VPIteration &Instance) { + return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance); + } + /// Set the generated Value for a given VPValue and a given Part. void set(VPValue *Def, Value *V, unsigned Part) { if (!Data.PerPartOutput.count(Def)) { @@ -567,6 +577,7 @@ public: /// instructions. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> { friend VPBasicBlock; + friend class VPBlockUtils; private: const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -586,6 +597,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, VPWidenPHISC, @@ -615,6 +627,18 @@ public: /// the specified recipe. void insertBefore(VPRecipeBase *InsertPos); + /// Insert an unlinked Recipe into a basic block immediately after + /// the specified Recipe. + void insertAfter(VPRecipeBase *InsertPos); + + /// Unlink this recipe from its current VPBasicBlock and insert it into + /// the VPBasicBlock that MovePos lives in, right after MovePos. + void moveAfter(VPRecipeBase *MovePos); + + /// This method unlinks 'this' from the containing basic block, but does not + /// delete it. + void removeFromParent(); + /// This method unlinks 'this' from the containing basic block and deletes it. /// /// \returns an iterator pointing to the element after the erased one @@ -626,7 +650,6 @@ public: /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. class VPInstruction : public VPUser, public VPRecipeBase { - friend class VPlanHCFGTransforms; friend class VPlanSlp; public: @@ -736,6 +759,36 @@ public: void print(raw_ostream &O, const Twine &Indent) const override; }; +/// A recipe for handling GEP instructions. 
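// The recipe declared below caches, at construction time, whether the GEP's
// pointer operand and each of its indices are invariant in the original
// loop, one bit per index, so execute() can later widen only the operands
// that actually vary. The per-operand classification as a standalone sketch
// (toy Loop/Value types, not LLVM's):
#include <cstdio>
#include <vector>

struct Value { bool DefinedOutsideLoop; };

struct Loop {
  bool isLoopInvariant(const Value *V) const { return V->DefinedOutsideLoop; }
};

int main() {
  Loop L;
  Value Ptr{true}, Idx0{true}, Idx1{false};
  std::vector<const Value *> Indices = {&Idx0, &Idx1};
  bool IsPtrLoopInvariant = L.isLoopInvariant(&Ptr);
  std::vector<bool> IsIndexLoopInvariant(Indices.size(), false);
  for (size_t I = 0; I < Indices.size(); ++I)
    IsIndexLoopInvariant[I] = L.isLoopInvariant(Indices[I]);
  std::printf("ptr=%s idx0=%s idx1=%s\n", // matches the recipe's Inv/Var dump
              IsPtrLoopInvariant ? "Inv" : "Var",
              IsIndexLoopInvariant[0] ? "Inv" : "Var",
              IsIndexLoopInvariant[1] ? "Inv" : "Var");
  return 0;
}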
+class VPWidenGEPRecipe : public VPRecipeBase { +private: + GetElementPtrInst *GEP; + bool IsPtrLoopInvariant; + SmallBitVector IsIndexLoopInvariant; + +public: + VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop) + : VPRecipeBase(VPWidenGEPSC), GEP(GEP), + IsIndexLoopInvariant(GEP->getNumIndices(), false) { + IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); + for (auto Index : enumerate(GEP->indices())) + IsIndexLoopInvariant[Index.index()] = + OrigLoop->isLoopInvariant(Index.value().get()); + } + ~VPWidenGEPRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenGEPSC; + } + + /// Generate the gep nodes. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent) const override; +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { @@ -818,13 +871,14 @@ public: class VPInterleaveRecipe : public VPRecipeBase { private: const InterleaveGroup<Instruction> *IG; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), IG(IG) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPInterleaveSC), IG(IG), User({Addr}) { + if (Mask) + User.addOperand(Mask); } ~VPInterleaveRecipe() override = default; @@ -833,6 +887,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPInterleaveSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. + return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; @@ -955,13 +1021,14 @@ public: class VPWidenMemoryInstructionRecipe : public VPRecipeBase { private: Instruction &Instr; - std::unique_ptr<VPUser> User; + VPUser User; public: - VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) { - if (Mask) // Create a VPInstruction to register as a user of the mask. - User.reset(new VPUser({Mask})); + VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr, + VPValue *Mask) + : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) { + if (Mask) + User.addOperand(Mask); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -969,6 +1036,18 @@ public: return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC; } + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return User.getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + // Mask is optional and therefore the last, currently 2nd operand. 
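// Both memory recipes now store the address as a mandatory first operand and
// the mask as an optional trailing one, so "unmasked" is simply "only one
// operand" and no separate wrapper object is needed. That operand convention
// as a standalone sketch (toy VPValue, not LLVM's):
#include <cassert>
#include <vector>

struct VPValue {};

struct MemRecipe {
  std::vector<VPValue *> Operands;
  MemRecipe(VPValue *Addr, VPValue *Mask) : Operands{Addr} {
    if (Mask)
      Operands.push_back(Mask); // optional, always last
  }
  VPValue *getAddr() const { return Operands[0]; }
  VPValue *getMask() const { // nullptr represents a full (all-ones) mask
    return Operands.size() == 2 ? Operands[1] : nullptr;
  }
};

int main() {
  VPValue Addr, Mask;
  assert(MemRecipe(&Addr, nullptr).getMask() == nullptr);
  assert(MemRecipe(&Addr, &Mask).getMask() == &Mask);
  return 0;
}
// The return just below implements exactly that check: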
+ return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr; + } + /// Generate the wide load/store. void execute(VPTransformState &State) override; @@ -1139,6 +1218,128 @@ public: void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits<VPBlockBase *> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits<const VPBlockBase *> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits<Inverse<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; + + static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<const VPRegionBlock *> + : public GraphTraits<const VPBlockBase *> { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits<Inverse<VPRegionBlock *>> + : public GraphTraits<Inverse<VPBlockBase *>> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator<NodeRef>; + + static NodeRef getEntryNode(Inverse<GraphRef> N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1241,35 +1442,45 @@ public: return Value2VPValue[V]; } + VPValue *getOrAddVPValue(Value *V) { + assert(V && "Trying to get or add the VPValue of a null Value"); + if (!Value2VPValue.count(V)) + addVPValue(V); + return getVPValue(V); + } + /// Return the VPLoopInfo analysis for this VPlan. VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. - static void updateDominatorTree(DominatorTree *DT, + static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB, BasicBlock *LoopPreHeaderBB, - BasicBlock *LoopLatchBB); + BasicBlock *LoopExitBB); }; /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; - unsigned Depth; + const VPlan &Plan; + unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap<const VPBlockBase *, unsigned> BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. 
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1316,135 +1527,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } //===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. - -template <> struct GraphTraits<VPBlockBase *> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits<const VPBlockBase *> { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits<Inverse<VPBlockBase *>> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator; - - static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<const VPRegionBlock *> - : public GraphTraits<const VPBlockBase *> { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits<Inverse<VPRegionBlock *>> - : public GraphTraits<Inverse<VPBlockBase *>> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator<NodeRef>; - - static NodeRef getEntryNode(Inverse<GraphRef> N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -//===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index e5ab24e52df6..9019ed15ec5f 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -346,11 +346,14 @@ SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() { void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) { dbgs() << " Ops: "; - for (auto Op : Values) - if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr()) - dbgs() << *Instr << " | "; - else - dbgs() << " nullptr | "; + for (auto Op : Values) { + if (auto *VPInstr = cast_or_null<VPInstruction>(Op)) + if (auto *Instr = VPInstr->getUnderlyingInstr()) { + dbgs() << *Instr << " | "; + continue; + } + dbgs() << " nullptr | "; + } dbgs() << "\n"; } diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 7ed7d21b6caa..3f6a2efd55cc 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.cpp +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1,4 +1,4 @@ -//===-- VPlanHCFGTransforms.cpp - Utility VPlan to VPlan transforms -------===// +//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
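The VPlanSlp::dumpBundle hunk above fixes a latent null dereference: the old code called getUnderlyingInstr() directly on the result of cast_or_null<VPInstruction>(Op), which is null exactly when Op is null, so the "nullptr" branch could never be reached safely. The rewrite binds the cast result and tests it before the call. The same bug shape in a standalone sketch (toy types standing in for the VPlan classes):

#include <cstdio>

struct Instr { const char *Name; };
struct VPInstr {
  Instr *Underlying;
  Instr *getUnderlyingInstr() { return Underlying; }
};

void dumpOp(VPInstr *Op) {
  if (VPInstr *VPI = Op) { // stands in for cast_or_null<VPInstruction>(Op)
    if (Instr *I = VPI->getUnderlyingInstr()) {
      std::printf("%s | ", I->Name);
      return;
    }
  }
  std::printf(" nullptr | ");
}

int main() {
  Instr Add{"add"};
  VPInstr V{&Add}, NoUnderlying{nullptr};
  dumpOp(&V);            // add |
  dumpOp(&NoUnderlying); // nullptr |
  dumpOp(nullptr);       // nullptr |  (the case the old code crashed on)
  return 0;
}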
@@ -11,17 +11,17 @@ /// //===----------------------------------------------------------------------===// -#include "VPlanHCFGTransforms.h" +#include "VPlanTransforms.h" #include "llvm/ADT/PostOrderIterator.h" using namespace llvm; -void VPlanHCFGTransforms::VPInstructionsToVPRecipes( - VPlanPtr &Plan, +void VPlanTransforms::VPInstructionsToVPRecipes( + Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList *Inductions, SmallPtrSetImpl<Instruction *> &DeadInstructions) { - VPRegionBlock *TopRegion = dyn_cast<VPRegionBlock>(Plan->getEntry()); + auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry()); ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry()); // Condition bit VPValues get deleted during transformation to VPRecipes. @@ -56,7 +56,9 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( VPRecipeBase *NewRecipe = nullptr; // Create VPWidenMemoryInstructionRecipe for loads and stores. if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) - NewRecipe = new VPWidenMemoryInstructionRecipe(*Inst, nullptr /*Mask*/); + NewRecipe = new VPWidenMemoryInstructionRecipe( + *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), + nullptr /*Mask*/); else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) { InductionDescriptor II = Inductions->lookup(Phi); if (II.getKind() == InductionDescriptor::IK_IntInduction || @@ -64,6 +66,8 @@ void VPlanHCFGTransforms::VPInstructionsToVPRecipes( NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi); } else NewRecipe = new VPWidenPHIRecipe(Phi); + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { + NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop); } else { // If the last recipe is a VPWidenRecipe, add Inst to it instead of // creating a new recipe. diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 79a23c33184f..0d3bd7da09a7 100644 --- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanHCFGTransforms.h +++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -1,4 +1,4 @@ -//===- VPlanHCFGTransforms.h - Utility VPlan to VPlan transforms ----------===// +//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,8 +10,8 @@ /// This file provides utility VPlan to VPlan transformations. //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H -#define LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H #include "VPlan.h" #include "llvm/IR/Instruction.h" @@ -19,17 +19,17 @@ namespace llvm { -class VPlanHCFGTransforms { +class VPlanTransforms { public: /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. 
 static void VPInstructionsToVPRecipes(
-      VPlanPtr &Plan,
+      Loop *OrigLoop, VPlanPtr &Plan,
       LoopVectorizationLegality::InductionList *Inductions,
       SmallPtrSetImpl<Instruction *> &DeadInstructions);
 };

 } // namespace llvm

-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANHCFGTRANSFORMS_H
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 7b6c228c229e..464498c29d89 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -37,7 +37,7 @@ class VPUser;
 // and live-outs which the VPlan will need to fix accordingly.
 class VPValue {
   friend class VPBuilder;
-  friend class VPlanHCFGTransforms;
+  friend class VPlanTransforms;
   friend class VPBasicBlock;
   friend class VPInterleavedAccessInfo;
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 394b1b93113b..ab3e7e2282e7 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -14,6 +14,7 @@
 #include "VPlanVerifier.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"

 #define DEBUG_TYPE "loop-vectorize"
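
The GraphTraits specializations removed from VPlan.h above follow a standard LLVM idiom: generic algorithms such as df_iterator and ReversePostOrderTraversal never touch a node type directly, only a traits adapter that exposes an entry node and a child-iterator range. The sketch below is a minimal self-contained rendering of that contract; Block, depthFirst, and the trait hooks are toy stand-ins chosen for illustration, not LLVM's actual VPBlockBase API.

// toy_graph_traits.cpp -- illustrative only; LLVM's real GraphTraits lives in
// llvm/include/llvm/ADT/GraphTraits.h and is consumed by df_iterator.
#include <cstdio>
#include <set>
#include <vector>

// Toy stand-in for VPBlockBase: a named node that exposes its successors.
struct Block {
  const char *Name;
  std::vector<Block *> Succs;
};

// The adapter contract: algorithms never touch Block directly, only these
// three hooks, so any node type can opt in via a specialization.
template <typename GraphT> struct GraphTraits; // primary template, undefined

template <> struct GraphTraits<Block *> {
  using NodeRef = Block *;
  using ChildIteratorType = std::vector<Block *>::iterator;
  static NodeRef getEntryNode(NodeRef N) { return N; }
  static ChildIteratorType child_begin(NodeRef N) { return N->Succs.begin(); }
  static ChildIteratorType child_end(NodeRef N) { return N->Succs.end(); }
};

// A generic depth-first walk written purely against the traits, the same way
// llvm::df_iterator is; the Seen set keeps cycles from recursing forever.
template <typename NodeT> void depthFirst(NodeT Entry, std::set<NodeT> &Seen) {
  using GT = GraphTraits<NodeT>;
  if (!Seen.insert(Entry).second)
    return;
  std::printf("%s\n", Entry->Name);
  for (auto I = GT::child_begin(Entry), E = GT::child_end(Entry); I != E; ++I)
    depthFirst(*I, Seen);
}

int main() {
  Block Entry{"entry"}, Body{"body"}, Exit{"exit"};
  Entry.Succs = {&Body};
  Body.Succs = {&Body, &Exit}; // back-edge, handled by the Seen set
  std::set<Block *> Seen;
  depthFirst(&Entry, Seen); // prints: entry, body, exit
  return 0;
}

The same shape explains why the removed Inverse<VPBlockBase *> specialization only swaps getSuccessors() for getPredecessors(): the traversal algorithm is unchanged, and only the adapter differs.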

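The VPlanSLP.cpp hunk earlier in this diff fixes a latent crash in dumpBundle: cast_or_null returns null for a null operand, so calling getUnderlyingInstr() on its result dereferenced a null pointer before the else branch could print " nullptr | ". The fixed code tests the outer pointer first, then the wrapped instruction, and shares one fallback. Below is a self-contained sketch of the hazard and the corrected control flow; Instr, Wrapper, and dumpOp are hypothetical stand-ins for LLVM's VPInstruction machinery, not its real API.

// toy_dump_bundle.cpp -- illustrative stand-ins, not LLVM's classes.
#include <cstdio>

struct Instr {
  const char *Text;
};

// Like VPInstruction: may wrap an underlying instruction, or wrap nothing.
struct Wrapper {
  Instr *Underlying;
  Instr *getUnderlyingInstr() const { return Underlying; }
};

// Before the fix, the two "empty" cases were conflated: the code called
// W->getUnderlyingInstr() without first checking W itself, crashing when the
// outer pointer (the cast_or_null result) was null. The fixed shape tests
// the outer pointer, then the payload, and falls through to one fallback.
void dumpOp(const Wrapper *W) {
  if (W)
    if (const Instr *I = W->getUnderlyingInstr()) {
      std::printf("%s | ", I->Text);
      return;
    }
  std::printf(" nullptr | ");
}

int main() {
  Instr Add{"add i32 %a, %b"};
  Wrapper WithInstr{&Add}, Empty{nullptr};
  dumpOp(&WithInstr); // "add i32 %a, %b | "
  dumpOp(&Empty);     // " nullptr | "
  dumpOp(nullptr);    // " nullptr | ", the case that used to dereference null
  std::printf("\n");
  return 0;
}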