author    | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2020-07-26 19:36:28 +0000
commit    | cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree      | 209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/CodeGen/CodeGenPrepare.cpp
parent    | 706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/CodeGen/CodeGenPrepare.cpp | 915
1 file changed, 668 insertions, 247 deletions
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index f05afd058746..e8b8e6c93cf0 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -43,7 +43,6 @@ #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" -#include "llvm/IR/CallSite.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -61,7 +60,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" @@ -178,6 +176,17 @@ static cl::opt<bool> ProfileGuidedSectionPrefix( "profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore, cl::desc("Use profile info to add section prefix for hot/cold functions")); +static cl::opt<bool> ProfileUnknownInSpecialSection( + "profile-unknown-in-special-section", cl::Hidden, cl::init(false), + cl::ZeroOrMore, + cl::desc("In profiling mode like sampleFDO, if a function doesn't have " + "profile, we cannot tell the function is cold for sure because " + "it may be a function newly added without ever being sampled. " + "With the flag enabled, compiler can put such profile unknown " + "functions into a special section, so runtime system can choose " + "to handle it in a different way than .text section, to save " + "RAM for example. ")); + static cl::opt<unsigned> FreqRatioToSkipMerge( "cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2), cl::desc("Skip merging empty blocks if (frequency of empty block) / " @@ -230,6 +239,15 @@ static cl::opt<bool> EnableICMP_EQToICMP_ST( "cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false), cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion.")); +static cl::opt<bool> + VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false), + cl::desc("Enable BFI update verification for " + "CodeGenPrepare.")); + +static cl::opt<bool> OptimizePhiTypes( + "cgp-optimize-phi-types", cl::Hidden, cl::init(false), + cl::desc("Enable converting phi types in CodeGenPrepare")); + namespace { enum ExtType { @@ -327,6 +345,7 @@ class TypePromotionTransaction; // FIXME: When we can selectively preserve passes, preserve the domtree. 
AU.addRequired<ProfileSummaryInfoWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<TargetPassConfig>(); AU.addRequired<TargetTransformInfoWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); } @@ -368,12 +387,14 @@ class TypePromotionTransaction; bool optimizeInst(Instruction *I, bool &ModifiedDT); bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); + bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, bool &ModifiedDT); bool optimizeExt(Instruction *&I); bool optimizeExtUses(Instruction *I); bool optimizeLoadExt(LoadInst *Load); bool optimizeShiftInst(BinaryOperator *BO); + bool optimizeFunnelShift(IntrinsicInst *Fsh); bool optimizeSelectInst(SelectInst *SI); bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI); bool optimizeSwitchInst(SwitchInst *SI); @@ -389,20 +410,25 @@ class TypePromotionTransaction; unsigned CreatedInstsCost = 0); bool mergeSExts(Function &F); bool splitLargeGEPOffsets(); + bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited, + SmallPtrSetImpl<Instruction *> &DeletedInstrs); + bool optimizePhiTypes(Function &F); bool performAddressTypePromotion( Instruction *&Inst, bool AllowPromotionWithoutCommonHeader, bool HasPromoted, TypePromotionTransaction &TPT, SmallVectorImpl<Instruction *> &SpeculativelyMovedExts); bool splitBranchCondition(Function &F, bool &ModifiedDT); - bool simplifyOffsetableRelocate(Instruction &I); + bool simplifyOffsetableRelocate(GCStatepointInst &I); bool tryToSinkFreeOperands(Instruction *I); - bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp, + bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0, + Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID); bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT); bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT); bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT); + void verifyBFIUpdates(Function &F); }; } // end anonymous namespace @@ -428,12 +454,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { InsertedInsts.clear(); PromotedInsts.clear(); - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { - TM = &TPC->getTM<TargetMachine>(); - SubtargetInfo = TM->getSubtargetImpl(F); - TLI = SubtargetInfo->getTargetLowering(); - TRI = SubtargetInfo->getRegisterInfo(); - } + TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); + SubtargetInfo = TM->getSubtargetImpl(F); + TLI = SubtargetInfo->getTargetLowering(); + TRI = SubtargetInfo->getRegisterInfo(); TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); @@ -446,14 +470,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) { F.setSectionPrefix(".hot"); else if (PSI->isFunctionColdInCallGraph(&F, *BFI)) F.setSectionPrefix(".unlikely"); + else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() && + PSI->isFunctionHotnessUnknown(F)) + F.setSectionPrefix(".unknown"); } /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. 
- if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI && - TLI->isSlowDivBypassed()) { + if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) { const DenseMap<unsigned int, unsigned int> &BypassWidths = - TLI->getBypassSlowDivWidths(); + TLI->getBypassSlowDivWidths(); BasicBlock* BB = &*F.begin(); while (BB != nullptr) { // bypassSlowDivision may create new BBs, but we don't want to reapply the @@ -495,6 +521,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) { MadeChange |= mergeSExts(F); if (!LargeOffsetGEPMap.empty()) MadeChange |= splitLargeGEPOffsets(); + MadeChange |= optimizePhiTypes(F); + + if (MadeChange) + eliminateFallThrough(F); // Really free removed instructions during promotion. for (Instruction *I : RemovedInsts) @@ -550,11 +580,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { } if (!DisableGCOpts) { - SmallVector<Instruction *, 2> Statepoints; + SmallVector<GCStatepointInst *, 2> Statepoints; for (BasicBlock &BB : F) for (Instruction &I : BB) - if (isStatepoint(I)) - Statepoints.push_back(&I); + if (auto *SP = dyn_cast<GCStatepointInst>(&I)) + Statepoints.push_back(SP); for (auto &I : Statepoints) EverMadeChange |= simplifyOffsetableRelocate(*I); } @@ -563,9 +593,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) { // preparatory transforms. EverMadeChange |= placeDbgValues(F); +#ifndef NDEBUG + if (VerifyBFIUpdates) + verifyBFIUpdates(F); +#endif + return EverMadeChange; } +// Verify BFI has been updated correctly by recomputing BFI and comparing them. +void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) { + DominatorTree NewDT(F); + LoopInfo NewLI(NewDT); + BranchProbabilityInfo NewBPI(F, NewLI, TLInfo); + BlockFrequencyInfo NewBFI(F, NewBPI, NewLI); + NewBFI.verifyMatch(*BFI); +} + /// Merge basic blocks which are connected by a single edge, where one of the /// basic blocks has a single successor pointing to the other basic block, /// which has a single predecessor. @@ -749,7 +793,7 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB, BlockFrequency PredFreq = BFI->getBlockFreq(Pred); BlockFrequency BBFreq = BFI->getBlockFreq(BB); - for (auto SameValueBB : SameIncomingValueBBs) + for (auto *SameValueBB : SameIncomingValueBBs) if (SameValueBB->getUniquePredecessor() == Pred && DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB)) BBFreq += BFI->getBlockFreq(SameValueBB); @@ -925,7 +969,7 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP, SmallVectorImpl<Value *> &OffsetV) { for (unsigned i = 1; i < GEP->getNumOperands(); i++) { // Only accept small constant integer operands - auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); + auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i)); if (!Op || Op->getZExtValue() > 20) return false; } @@ -949,7 +993,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, // be skipped by optimization and we do not care about them. 
for (auto R = RelocatedBase->getParent()->getFirstInsertionPt(); &*R != RelocatedBase; ++R) - if (auto RI = dyn_cast<GCRelocateInst>(R)) + if (auto *RI = dyn_cast<GCRelocateInst>(R)) if (RI->getStatepoint() == RelocatedBase->getStatepoint()) if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) { RelocatedBase->moveBefore(RI); @@ -973,7 +1017,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, } Value *Base = ToReplace->getBasePtr(); - auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); + auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr()); if (!Derived || Derived->getPointerOperand() != Base) continue; @@ -1050,10 +1094,9 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase, // %base' = gc.relocate(%tok, i32 4, i32 4) // %ptr' = gep %base' + 15 // %val = load %ptr' -bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { +bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) { bool MadeChange = false; SmallVector<GCRelocateInst *, 2> AllRelocateCalls; - for (auto *U : I.users()) if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U)) // Collect all the relocate calls associated with a statepoint @@ -1187,6 +1230,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI, } bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, + Value *Arg0, Value *Arg1, CmpInst *Cmp, Intrinsic::ID IID) { if (BO->getParent() != Cmp->getParent()) { @@ -1204,8 +1248,6 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, } // We allow matching the canonical IR (add X, C) back to (usubo X, -C). - Value *Arg0 = BO->getOperand(0); - Value *Arg1 = BO->getOperand(1); if (BO->getOpcode() == Instruction::Add && IID == Intrinsic::usub_with_overflow) { assert(isa<Constant>(Arg1) && "Unexpected input for usubo"); @@ -1215,7 +1257,9 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, // Insert at the first instruction of the pair. Instruction *InsertPt = nullptr; for (Instruction &Iter : *Cmp->getParent()) { - if (&Iter == BO || &Iter == Cmp) { + // If BO is an XOR, it is not guaranteed that it comes after both inputs to + // the overflow intrinsic are defined. + if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) { InsertPt = &Iter; break; } @@ -1224,12 +1268,16 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO, IRBuilder<> Builder(InsertPt); Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1); - Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + if (BO->getOpcode() != Instruction::Xor) { + Value *Math = Builder.CreateExtractValue(MathOV, 0, "math"); + BO->replaceAllUsesWith(Math); + } else + assert(BO->hasOneUse() && + "Patterns with XOr should use the BO only in the compare"); Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov"); - BO->replaceAllUsesWith(Math); Cmp->replaceAllUsesWith(OV); - BO->eraseFromParent(); Cmp->eraseFromParent(); + BO->eraseFromParent(); return true; } @@ -1269,12 +1317,17 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT) { Value *A, *B; BinaryOperator *Add; - if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) + if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) { if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add)) return false; + // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases. 
+ A = Add->getOperand(0); + B = Add->getOperand(1); + } if (!TLI->shouldFormOverflowOp(ISD::UADDO, - TLI->getValueType(*DL, Add->getType()))) + TLI->getValueType(*DL, Add->getType()), + Add->hasNUsesOrMore(2))) return false; // We don't want to move around uses of condition values this late, so we @@ -1283,7 +1336,8 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp, if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse()) return false; - if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp, + Intrinsic::uadd_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. @@ -1341,10 +1395,12 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, return false; if (!TLI->shouldFormOverflowOp(ISD::USUBO, - TLI->getValueType(*DL, Sub->getType()))) + TLI->getValueType(*DL, Sub->getType()), + Sub->hasNUsesOrMore(2))) return false; - if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow)) + if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1), + Cmp, Intrinsic::usub_with_overflow)) return false; // Reset callers - do not crash by iterating over a dead instruction. @@ -1813,9 +1869,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, const TargetLowering *TLI, const DataLayout *DL, bool &ModifiedDT) { - if (!TLI || !DL) - return false; - // If a zero input is undefined, it doesn't make sense to despeculate that. if (match(CountZeros->getOperand(1), m_One())) return false; @@ -1877,7 +1930,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. - if (TLI && isa<InlineAsm>(CI->getCalledValue())) { + if (CI->isInlineAsm()) { if (TLI->ExpandInlineAsm(CI)) { // Avoid invalidating the iterator. CurInstIterator = BB->begin(); @@ -1894,7 +1947,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // Align the pointer arguments to this call if the target thinks it's a good // idea unsigned MinSize, PrefAlign; - if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { + if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) { for (auto &Arg : CI->arg_operands()) { // We want to align both objects whose address is used directly and // objects whose address is used in casts and GEPs, though it only makes @@ -1912,7 +1965,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { AllocaInst *AI; if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign && DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2) - AI->setAlignment(MaybeAlign(PrefAlign)); + AI->setAlignment(Align(PrefAlign)); // Global variables can only be aligned if they are defined in this // object (i.e. 
they are uniquely initialized in this object), and // over-aligning global variables that have an explicit section is @@ -1927,12 +1980,14 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // If this is a memcpy (or similar) then we may be able to improve the // alignment if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) { - unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL); - if (DestAlign > MI->getDestAlignment()) + Align DestAlign = getKnownAlignment(MI->getDest(), *DL); + MaybeAlign MIDestAlign = MI->getDestAlign(); + if (!MIDestAlign || DestAlign > *MIDestAlign) MI->setDestAlignment(DestAlign); if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { - unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL); - if (SrcAlign > MTI->getSourceAlignment()) + MaybeAlign MTISrcAlign = MTI->getSourceAlign(); + Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL); + if (!MTISrcAlign || SrcAlign > *MTISrcAlign) MTI->setSourceAlignment(SrcAlign); } } @@ -1942,8 +1997,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // cold block. This interacts with our handling for loads and stores to // ensure that we can fold all uses of a potential addressing computation // into their uses. TODO: generalize this to work over profiling data - bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get()); - if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) + if (CI->hasFnAttr(Attribute::Cold) && + !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get())) for (auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; @@ -1955,10 +2010,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { if (II) { switch (II->getIntrinsicID()) { default: break; + case Intrinsic::assume: { + II->eraseFromParent(); + return true; + } + case Intrinsic::experimental_widenable_condition: { // Give up on future widening oppurtunties so that we can fold away dead // paths and merge blocks before going into block-local instruction - // selection. + // selection. if (II->use_empty()) { II->eraseFromParent(); return true; @@ -2008,21 +2068,43 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { case Intrinsic::ctlz: // If counting zeros is expensive, try to avoid it. return despeculateCountZeros(II, TLI, DL, ModifiedDT); + case Intrinsic::fshl: + case Intrinsic::fshr: + return optimizeFunnelShift(II); case Intrinsic::dbg_value: return fixupDbgValue(II); + case Intrinsic::vscale: { + // If datalayout has no special restrictions on vector data layout, + // replace `llvm.vscale` by an equivalent constant expression + // to benefit from cheap constant propagation. 
+ Type *ScalableVectorTy = + VectorType::get(Type::getInt8Ty(II->getContext()), 1, true); + if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) { + auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo()); + auto *One = ConstantInt::getSigned(II->getType(), 1); + auto *CGep = + ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One); + II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType())); + II->eraseFromParent(); + return true; + } + break; } - - if (TLI) { - SmallVector<Value*, 2> PtrOps; - Type *AccessTy; - if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) - while (!PtrOps.empty()) { - Value *PtrVal = PtrOps.pop_back_val(); - unsigned AS = PtrVal->getType()->getPointerAddressSpace(); - if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) - return true; - } + case Intrinsic::masked_gather: + return optimizeGatherScatterInst(II, II->getArgOperand(0)); + case Intrinsic::masked_scatter: + return optimizeGatherScatterInst(II, II->getArgOperand(1)); } + + SmallVector<Value *, 2> PtrOps; + Type *AccessTy; + if (TLI->getAddrModeArguments(II, PtrOps, AccessTy)) + while (!PtrOps.empty()) { + Value *PtrVal = PtrOps.pop_back_val(); + unsigned AS = PtrVal->getType()->getPointerAddressSpace(); + if (optimizeMemoryInst(II, PtrVal, AccessTy, AS)) + return true; + } } // From here on out we're working with named functions. @@ -2033,7 +2115,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { // to fortified library functions (e.g. __memcpy_chk) that have the default // "don't know" as the objectsize. Anything else should be left alone. FortifiedLibCallSimplifier Simplifier(TLInfo, true); - if (Value *V = Simplifier.optimizeCall(CI)) { + IRBuilder<> Builder(CI); + if (Value *V = Simplifier.optimizeCall(CI, Builder)) { CI->replaceAllUsesWith(V); CI->eraseFromParent(); return true; @@ -2073,14 +2156,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) { /// ret i32 %tmp2 /// @endcode bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) { - if (!TLI) - return false; - ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator()); if (!RetI) return false; PHINode *PN = nullptr; + ExtractValueInst *EVI = nullptr; BitCastInst *BCI = nullptr; Value *V = RetI->getReturnValue(); if (V) { @@ -2088,6 +2169,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT if (BCI) V = BCI->getOperand(0); + EVI = dyn_cast<ExtractValueInst>(V); + if (EVI) { + V = EVI->getOperand(0); + if (!std::all_of(EVI->idx_begin(), EVI->idx_end(), + [](unsigned idx) { return idx == 0; })) + return false; + } + PN = dyn_cast<PHINode>(V); if (!PN) return false; @@ -2101,7 +2190,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT if (PN) { BasicBlock::iterator BI = BB->begin(); // Skip over debug and the bitcast. - do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI); + do { + ++BI; + } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI); if (&*BI != RetI) return false; } else { @@ -2157,6 +2248,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT // Duplicate the return into TailCallBB. 
(void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB); + assert(!VerifyBFIUpdates || + BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB)); + BFI->setBlockFreq( + BB, + (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency()); ModifiedDT = Changed = true; ++NumRetsDup; } @@ -2354,6 +2450,9 @@ namespace { /// This class provides transaction based operation on the IR. /// Every change made through this class is recorded in the internal state and /// can be undone (rollback) until commit is called. +/// CGP does not check if instructions could be speculatively executed when +/// moved. Preserving the original location would pessimize the debugging +/// experience, as well as negatively impact the quality of sample PGO. class TypePromotionTransaction { /// This represents the common interface of the individual transaction. /// Each class implements the logic for doing one specific modification on @@ -2516,6 +2615,7 @@ class TypePromotionTransaction { /// trunc Opnd to Ty. TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) { IRBuilder<> Builder(Opnd); + Builder.SetCurrentDebugLocation(DebugLoc()); Val = Builder.CreateTrunc(Opnd, Ty, "promoted"); LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n"); } @@ -2568,6 +2668,7 @@ class TypePromotionTransaction { ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty) : TypePromotionAction(InsertPt) { IRBuilder<> Builder(InsertPt); + Builder.SetCurrentDebugLocation(DebugLoc()); Val = Builder.CreateZExt(Opnd, Ty, "promoted"); LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n"); } @@ -2721,8 +2822,9 @@ public: TypePromotionTransaction(SetOfInstrs &RemovedInsts) : RemovedInsts(RemovedInsts) {} - /// Advocate every changes made in that transaction. - void commit(); + /// Advocate every changes made in that transaction. Return true if any change + /// happen. + bool commit(); /// Undo all the changes made after the given point. void rollback(ConstRestorationPt Point); @@ -2828,11 +2930,13 @@ TypePromotionTransaction::getRestorationPoint() const { return !Actions.empty() ? Actions.back().get() : nullptr; } -void TypePromotionTransaction::commit() { +bool TypePromotionTransaction::commit() { for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt; ++It) (*It)->commit(); + bool Modified = !Actions.empty(); Actions.clear(); + return Modified; } void TypePromotionTransaction::rollback( @@ -3115,7 +3219,7 @@ public: SmallPtrSet<Value *, 32> Visited; WorkList.push_back(Val); while (!WorkList.empty()) { - auto P = WorkList.pop_back_val(); + auto *P = WorkList.pop_back_val(); if (!Visited.insert(P).second) continue; if (auto *PI = dyn_cast<Instruction>(P)) @@ -3164,13 +3268,13 @@ public: void destroyNewNodes(Type *CommonType) { // For safe erasing, replace the uses with dummy value first. - auto Dummy = UndefValue::get(CommonType); - for (auto I : AllPhiNodes) { + auto *Dummy = UndefValue::get(CommonType); + for (auto *I : AllPhiNodes) { I->replaceAllUsesWith(Dummy); I->eraseFromParent(); } AllPhiNodes.clear(); - for (auto I : AllSelectNodes) { + for (auto *I : AllSelectNodes) { I->replaceAllUsesWith(Dummy); I->eraseFromParent(); } @@ -3511,7 +3615,7 @@ private: // Must be a Phi node then. auto *PHI = cast<PHINode>(V); // Fill the Phi node with values from predecessors. 
- for (auto B : predecessors(PHI->getParent())) { + for (auto *B : predecessors(PHI->getParent())) { Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B); assert(Map.find(PV) != Map.end() && "No predecessor Value!"); PHI->addIncoming(ST.Get(Map[PV]), B); @@ -3625,10 +3729,11 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale, // X*Scale + C*Scale to addr mode. ConstantInt *CI = nullptr; Value *AddLHS = nullptr; if (isa<Instruction>(ScaleReg) && // not a constant expr. - match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) { + match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) && + CI->getValue().isSignedIntN(64)) { TestAddrMode.InBounds = false; TestAddrMode.ScaledReg = AddLHS; - TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale; + TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale; // If this addressing mode is legal, commit it and remember that we folded // this instruction. @@ -3849,7 +3954,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst, // We can get through binary operator, if it is legal. In other words, the // binary operator must have a nuw or nsw flag. const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - if (BinOp && isa<OverflowingBinaryOperator>(BinOp) && + if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) && ((!IsSExt && BinOp->hasNoUnsignedWrap()) || (IsSExt && BinOp->hasNoSignedWrap()))) return true; @@ -4251,15 +4356,20 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue(); ConstantOffset += SL->getElementOffset(Idx); } else { - uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType()); - if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { - const APInt &CVal = CI->getValue(); - if (CVal.getMinSignedBits() <= 64) { - ConstantOffset += CVal.getSExtValue() * TypeSize; - continue; + TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType()); + if (TS.isNonZero()) { + // The optimisations below currently only work for fixed offsets. + if (TS.isScalable()) + return false; + int64_t TypeSize = TS.getFixedSize(); + if (ConstantInt *CI = + dyn_cast<ConstantInt>(AddrInst->getOperand(i))) { + const APInt &CVal = CI->getValue(); + if (CVal.getMinSignedBits() <= 64) { + ConstantOffset += CVal.getSExtValue() * TypeSize; + continue; + } } - } - if (TypeSize) { // Scales of zero don't do anything. // We only allow one variable index at the moment. if (VariableOperand != -1) return false; @@ -4422,11 +4532,13 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) { TypePromotionTransaction::ConstRestorationPt LastKnownGood = TPT.getRestorationPoint(); if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) { - // Fold in immediates if legal for the target. - AddrMode.BaseOffs += CI->getSExtValue(); - if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) - return true; - AddrMode.BaseOffs -= CI->getSExtValue(); + if (CI->getValue().isSignedIntN(64)) { + // Fold in immediates if legal for the target. + AddrMode.BaseOffs += CI->getSExtValue(); + if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) + return true; + AddrMode.BaseOffs -= CI->getSExtValue(); + } } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) { // If this is a global variable, try to fold it into the addressing mode. 
if (!AddrMode.BaseGV) { @@ -4502,8 +4614,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal, const TargetRegisterInfo &TRI) { const Function *F = CI->getFunction(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, - ImmutableCallSite(CI)); + TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI); for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -4581,14 +4692,16 @@ static bool FindAllMemoryUses( } if (CallInst *CI = dyn_cast<CallInst>(UserI)) { - // If this is a cold call, we can sink the addressing calculation into - // the cold path. See optimizeCallInst - bool OptForSize = OptSize || + if (CI->hasFnAttr(Attribute::Cold)) { + // If this is a cold call, we can sink the addressing calculation into + // the cold path. See optimizeCallInst + bool OptForSize = OptSize || llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); - if (!OptForSize && CI->hasFnAttr(Attribute::Cold)) - continue; + if (!OptForSize) + continue; + } - InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue()); + InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand()); if (!IA) return true; // If this is a memory operand, we're cool, otherwise bail out. @@ -4854,7 +4967,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, TPT.rollback(LastKnownGood); return false; } - TPT.commit(); + bool Modified = TPT.commit(); // Get the combined AddrMode (or the only AddrMode, if we only had one). ExtAddrMode AddrMode = AddrModes.getAddrMode(); @@ -4868,7 +4981,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, })) { LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n"); - return false; + return Modified; } // Insert this computation right after this user. Since our caller is @@ -4891,7 +5004,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && - TM && SubtargetInfo->addrSinkUsingGEPs())) { + SubtargetInfo->addrSinkUsingGEPs())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode @@ -4909,7 +5022,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // We can't add more than one pointer together, nor can we scale a // pointer (both of which seem meaningless). 
if (ResultPtr || AddrMode.Scale != 1) - return false; + return Modified; ResultPtr = AddrMode.ScaledReg; AddrMode.Scale = 0; @@ -4926,12 +5039,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *ScaledRegTy = AddrMode.ScaledReg->getType(); if (cast<IntegerType>(IntPtrTy)->getBitWidth() > cast<IntegerType>(ScaledRegTy)->getBitWidth()) - return false; + return Modified; } if (AddrMode.BaseGV) { if (ResultPtr) - return false; + return Modified; ResultPtr = AddrMode.BaseGV; } @@ -4955,7 +5068,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) { SunkAddr = Constant::getNullValue(Addr->getType()); } else if (!ResultPtr) { - return false; + return Modified; } else { Type *I8PtrTy = Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace()); @@ -5040,7 +5153,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, (ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) || (AddrMode.BaseGV && DL->isNonIntegralPointerType(AddrMode.BaseGV->getType()))) - return false; + return Modified; LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst << "\n"); @@ -5080,7 +5193,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Instruction *I = dyn_cast_or_null<Instruction>(Result); if (I && (Result != AddrMode.BaseReg)) I->eraseFromParent(); - return false; + return Modified; } if (AddrMode.Scale != 1) V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale), @@ -5142,6 +5255,119 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return true; } +/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find +/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can +/// only handle a 2 operand GEP in the same basic block or a splat constant +/// vector. The 2 operands to the GEP must have a scalar pointer and a vector +/// index. +/// +/// If the existing GEP has a vector base pointer that is splat, we can look +/// through the splat to find the scalar pointer. If we can't find a scalar +/// pointer there's nothing we can do. +/// +/// If we have a GEP with more than 2 indices where the middle indices are all +/// zeroes, we can replace it with 2 GEPs where the second has 2 operands. +/// +/// If the final index isn't a vector or is a splat, we can emit a scalar GEP +/// followed by a GEP with an all zeroes vector index. This will enable +/// SelectionDAGBuilder to use a the scalar GEP as the uniform base and have a +/// zero index. +bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, + Value *Ptr) { + const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (!GEP || !GEP->hasIndices()) + return false; + + // If the GEP and the gather/scatter aren't in the same BB, don't optimize. + // FIXME: We should support this by sinking the GEP. + if (MemoryInst->getParent() != GEP->getParent()) + return false; + + SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end()); + + bool RewriteGEP = false; + + if (Ops[0]->getType()->isVectorTy()) { + Ops[0] = const_cast<Value *>(getSplatValue(Ops[0])); + if (!Ops[0]) + return false; + RewriteGEP = true; + } + + unsigned FinalIndex = Ops.size() - 1; + + // Ensure all but the last index is 0. + // FIXME: This isn't strictly required. All that's required is that they are + // all scalars or splats. 
+ for (unsigned i = 1; i < FinalIndex; ++i) { + auto *C = dyn_cast<Constant>(Ops[i]); + if (!C) + return false; + if (isa<VectorType>(C->getType())) + C = C->getSplatValue(); + auto *CI = dyn_cast_or_null<ConstantInt>(C); + if (!CI || !CI->isZero()) + return false; + // Scalarize the index if needed. + Ops[i] = CI; + } + + // Try to scalarize the final index. + if (Ops[FinalIndex]->getType()->isVectorTy()) { + if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) { + auto *C = dyn_cast<ConstantInt>(V); + // Don't scalarize all zeros vector. + if (!C || !C->isZero()) { + Ops[FinalIndex] = V; + RewriteGEP = true; + } + } + } + + // If we made any changes or the we have extra operands, we need to generate + // new instructions. + if (!RewriteGEP && Ops.size() == 2) + return false; + + unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements(); + + IRBuilder<> Builder(MemoryInst); + + Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType()); + + Value *NewAddr; + + // If the final index isn't a vector, emit a scalar GEP containing all ops + // and a vector GEP with all zeroes final index. + if (!Ops[FinalIndex]->getType()->isVectorTy()) { + NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front()); + auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts); + NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy)); + } else { + Value *Base = Ops[0]; + Value *Index = Ops[FinalIndex]; + + // Create a scalar GEP if there are more than 2 operands. + if (Ops.size() != 2) { + // Replace the last index with 0. + Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy); + Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front()); + } + + // Now create the GEP with scalar pointer and vector index. + NewAddr = Builder.CreateGEP(Base, Index); + } + + MemoryInst->replaceUsesOfWith(Ptr, NewAddr); + + // If we have no uses, recursively delete the value and all dead instructions + // using it. + if (Ptr->use_empty()) + RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo); + + return true; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { @@ -5150,7 +5376,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { const TargetRegisterInfo *TRI = TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo(); TargetLowering::AsmOperandInfoVector TargetConstraints = - TLI->ParseConstraints(*DL, TRI, CS); + TLI->ParseConstraints(*DL, TRI, *CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; @@ -5231,7 +5457,7 @@ bool CodeGenPrepare::tryToPromoteExts( bool Promoted = false; // Iterate over all the extensions to try to promote them. - for (auto I : Exts) { + for (auto *I : Exts) { // Early check if we directly have ext(load). if (isa<LoadInst>(I->getOperand(0))) { ProfitablyMovedExts.push_back(I); @@ -5242,7 +5468,7 @@ bool CodeGenPrepare::tryToPromoteExts( // this check inside the for loop is to catch the case where an extension // is directly fed by a load because in such case the extension can be moved // up without any promotion on its operands. - if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion) + if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion) return false; // Get the action to perform the promotion. 
@@ -5292,7 +5518,7 @@ bool CodeGenPrepare::tryToPromoteExts( SmallVector<Instruction *, 2> NewlyMovedExts; (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost); bool NewPromoted = false; - for (auto ExtInst : NewlyMovedExts) { + for (auto *ExtInst : NewlyMovedExts) { Instruction *MovedExt = cast<Instruction>(ExtInst); Value *ExtOperand = MovedExt->getOperand(0); // If we have reached to a load, we need this extra profitability check @@ -5358,9 +5584,9 @@ bool CodeGenPrepare::mergeSExts(Function &F) { return Changed; } -// Spliting large data structures so that the GEPs accessing them can have +// Splitting large data structures so that the GEPs accessing them can have // smaller offsets so that they can be sunk to the same blocks as their users. -// For example, a large struct starting from %base is splitted into two parts +// For example, a large struct starting from %base is split into two parts // where the second part starts from %new_base. // // Before: @@ -5421,7 +5647,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { int64_t BaseOffset = LargeOffsetGEPs.begin()->second; Value *NewBaseGEP = nullptr; - auto LargeOffsetGEP = LargeOffsetGEPs.begin(); + auto *LargeOffsetGEP = LargeOffsetGEPs.begin(); while (LargeOffsetGEP != LargeOffsetGEPs.end()) { GetElementPtrInst *GEP = LargeOffsetGEP->first; int64_t Offset = LargeOffsetGEP->second; @@ -5435,7 +5661,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { GEP->getAddressSpace())) { // We need to create a new base if the offset to the current base is // too large to fit into the addressing mode. So, a very large struct - // may be splitted into several parts. + // may be split into several parts. BaseGEP = GEP; BaseOffset = Offset; NewBaseGEP = nullptr; @@ -5506,6 +5732,155 @@ bool CodeGenPrepare::splitLargeGEPOffsets() { return Changed; } +bool CodeGenPrepare::optimizePhiType( + PHINode *I, SmallPtrSetImpl<PHINode *> &Visited, + SmallPtrSetImpl<Instruction *> &DeletedInstrs) { + // We are looking for a collection on interconnected phi nodes that together + // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts + // are of the same type. Convert the whole set of nodes to the type of the + // bitcast. 
+ Type *PhiTy = I->getType(); + Type *ConvertTy = nullptr; + if (Visited.count(I) || + (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy())) + return false; + + SmallVector<Instruction *, 4> Worklist; + Worklist.push_back(cast<Instruction>(I)); + SmallPtrSet<PHINode *, 4> PhiNodes; + PhiNodes.insert(I); + Visited.insert(I); + SmallPtrSet<Instruction *, 4> Defs; + SmallPtrSet<Instruction *, 4> Uses; + + while (!Worklist.empty()) { + Instruction *II = Worklist.pop_back_val(); + + if (auto *Phi = dyn_cast<PHINode>(II)) { + // Handle Defs, which might also be PHI's + for (Value *V : Phi->incoming_values()) { + if (auto *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi)) { + if (Visited.count(OpPhi)) + return false; + PhiNodes.insert(OpPhi); + Visited.insert(OpPhi); + Worklist.push_back(OpPhi); + } + } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) { + if (!Defs.count(OpLoad)) { + Defs.insert(OpLoad); + Worklist.push_back(OpLoad); + } + } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) { + if (!Defs.count(OpEx)) { + Defs.insert(OpEx); + Worklist.push_back(OpEx); + } + } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { + if (!ConvertTy) + ConvertTy = OpBC->getOperand(0)->getType(); + if (OpBC->getOperand(0)->getType() != ConvertTy) + return false; + if (!Defs.count(OpBC)) { + Defs.insert(OpBC); + Worklist.push_back(OpBC); + } + } else if (!isa<UndefValue>(V)) + return false; + } + } + + // Handle uses which might also be phi's + for (User *V : II->users()) { + if (auto *OpPhi = dyn_cast<PHINode>(V)) { + if (!PhiNodes.count(OpPhi)) { + if (Visited.count(OpPhi)) + return false; + PhiNodes.insert(OpPhi); + Visited.insert(OpPhi); + Worklist.push_back(OpPhi); + } + } else if (auto *OpStore = dyn_cast<StoreInst>(V)) { + if (OpStore->getOperand(0) != II) + return false; + Uses.insert(OpStore); + } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) { + if (!ConvertTy) + ConvertTy = OpBC->getType(); + if (OpBC->getType() != ConvertTy) + return false; + Uses.insert(OpBC); + } else + return false; + } + } + + if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy)) + return false; + + LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to " + << *ConvertTy << "\n"); + + // Create all the new phi nodes of the new type, and bitcast any loads to the + // correct type. + ValueToValueMap ValMap; + ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy); + for (Instruction *D : Defs) { + if (isa<BitCastInst>(D)) + ValMap[D] = D->getOperand(0); + else + ValMap[D] = + new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode()); + } + for (PHINode *Phi : PhiNodes) + ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(), + Phi->getName() + ".tc", Phi); + // Pipe together all the PhiNodes. + for (PHINode *Phi : PhiNodes) { + PHINode *NewPhi = cast<PHINode>(ValMap[Phi]); + for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++) + NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)], + Phi->getIncomingBlock(i)); + } + // And finally pipe up the stores and bitcasts + for (Instruction *U : Uses) { + if (isa<BitCastInst>(U)) { + DeletedInstrs.insert(U); + U->replaceAllUsesWith(ValMap[U->getOperand(0)]); + } else + U->setOperand(0, + new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U)); + } + + // Save the removed phis to be deleted later. 
+ for (PHINode *Phi : PhiNodes) + DeletedInstrs.insert(Phi); + return true; +} + +bool CodeGenPrepare::optimizePhiTypes(Function &F) { + if (!OptimizePhiTypes) + return false; + + bool Changed = false; + SmallPtrSet<PHINode *, 4> Visited; + SmallPtrSet<Instruction *, 4> DeletedInstrs; + + // Attempt to optimize all the phis in the functions to the correct type. + for (auto &BB : F) + for (auto &Phi : BB.phis()) + Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs); + + // Remove any old phi's that have been converted. + for (auto *I : DeletedInstrs) { + I->replaceAllUsesWith(UndefValue::get(I->getType())); + I->eraseFromParent(); + } + + return Changed; +} + /// Return true, if an ext(load) can be formed from an extension in /// \p MovedExts. bool CodeGenPrepare::canFormExtLd( @@ -5567,11 +5942,6 @@ bool CodeGenPrepare::canFormExtLd( /// \p Inst[in/out] the extension may be modified during the process if some /// promotions apply. bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { - // ExtLoad formation and address type promotion infrastructure requires TLI to - // be effective. - if (!TLI) - return false; - bool AllowPromotionWithoutCommonHeader = false; /// See if it is an interesting sext operations for the address type /// promotion before trying to promote it, e.g., the ones with the right @@ -5596,16 +5966,8 @@ bool CodeGenPrepare::optimizeExt(Instruction *&Inst) { if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) { assert(LI && ExtFedByLoad && "Expect a valid load and extension"); TPT.commit(); - // Move the extend into the same block as the load + // Move the extend into the same block as the load. ExtFedByLoad->moveAfter(LI); - // CGP does not check if the zext would be speculatively executed when moved - // to the same basic block as the load. Preserving its original location - // would pessimize the debugging experience, as well as negatively impact - // the quality of sample pgo. We don't want to use "line 0" as that has a - // size cost in the line-table section and logically the zext can be seen as - // part of the load. Therefore we conservatively reuse the same debug - // location for the load and the zext. - ExtFedByLoad->setDebugLoc(LI->getDebugLoc()); ++NumExtsMoved; Inst = ExtFedByLoad; return true; @@ -5633,7 +5995,7 @@ bool CodeGenPrepare::performAddressTypePromotion( bool Promoted = false; SmallPtrSet<Instruction *, 1> UnhandledExts; bool AllSeenFirst = true; - for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); DenseMap<Value *, Instruction *>::iterator AlreadySeen = SeenChainsForSExt.find(HeadOfChain); @@ -5651,7 +6013,7 @@ bool CodeGenPrepare::performAddressTypePromotion( TPT.commit(); if (HasPromoted) Promoted = true; - for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); SeenChainsForSExt[HeadOfChain] = nullptr; ValToSExtendedUses[HeadOfChain].push_back(I); @@ -5662,7 +6024,7 @@ bool CodeGenPrepare::performAddressTypePromotion( // This is the first chain visited from the header, keep the current chain // as unhandled. Defer to promote this until we encounter another SExt // chain derived from the same header. 
- for (auto I : SpeculativelyMovedExts) { + for (auto *I : SpeculativelyMovedExts) { Value *HeadOfChain = I->getOperand(0); SeenChainsForSExt[HeadOfChain] = Inst; } @@ -5670,7 +6032,7 @@ bool CodeGenPrepare::performAddressTypePromotion( } if (!AllSeenFirst && !UnhandledExts.empty()) - for (auto VisitedSExt : UnhandledExts) { + for (auto *VisitedSExt : UnhandledExts) { if (RemovedInsts.count(VisitedSExt)) continue; TypePromotionTransaction TPT(RemovedInsts); @@ -5681,7 +6043,7 @@ bool CodeGenPrepare::performAddressTypePromotion( TPT.commit(); if (HasPromoted) Promoted = true; - for (auto I : Chains) { + for (auto *I : Chains) { Value *HeadOfChain = I->getOperand(0); // Mark this as handled. SeenChainsForSExt[HeadOfChain] = nullptr; @@ -5701,7 +6063,7 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) { return false; // Only do this xform if truncating is free. - if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType())) + if (!TLI->isTruncateFree(I->getType(), Src->getType())) return false; // Only safe to perform the optimization if the source is also defined in @@ -5947,7 +6309,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) { // If it's safe to speculatively execute, then it should not have side // effects; therefore, it's safe to sink and possibly *not* execute. return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) && - TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive; + TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >= + TargetTransformInfo::TCC_Expensive; } /// Returns true if a SelectInst should be turned into an explicit branch. @@ -6044,13 +6407,47 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) { return true; } +bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) { + Intrinsic::ID Opcode = Fsh->getIntrinsicID(); + assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) && + "Expected a funnel shift"); + + // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper + // than general vector shifts, and (3) the shift amount is select-of-splatted + // values, hoist the funnel shifts before the select: + // fsh Op0, Op1, (select Cond, TVal, FVal) --> + // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal) + // + // This is inverting a generic IR transform when we know that the cost of a + // general vector shift is more than the cost of 2 shift-by-scalars. + // We can't do this effectively in SDAG because we may not be able to + // determine if the select operands are splats from within a basic block. + Type *Ty = Fsh->getType(); + if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty)) + return false; + Value *Cond, *TVal, *FVal; + if (!match(Fsh->getOperand(2), + m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal))))) + return false; + if (!isSplatValue(TVal) || !isSplatValue(FVal)) + return false; + + IRBuilder<> Builder(Fsh); + Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1); + Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal }); + Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal }); + Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal); + Fsh->replaceAllUsesWith(NewSel); + Fsh->eraseFromParent(); + return true; +} + /// If we have a SelectInst that will likely profit from branch prediction, /// turn it into a branch. bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // If branch conversion isn't desirable, exit early. 
- if (DisableSelectToBranch || - OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) || - !TLI) + if (DisableSelectToBranch || OptSize || + llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())) return false; // Find all consecutive select instructions that share the same condition. @@ -6103,7 +6500,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // Into: // start: // %cmp = cmp uge i32 %a, %b - // br i1 %cmp, label %select.true, label %select.false + // %cmp.frozen = freeze %cmp + // br i1 %cmp.frozen, label %select.true, label %select.false // select.true: // br label %select.end // select.false: @@ -6111,6 +6509,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { // select.end: // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] // + // %cmp should be frozen, otherwise it may introduce undefined behavior. // In addition, we may sink instructions that produce %c or %d from // the entry block into the destination(s) of the new branch. // If the true or false blocks do not contain a sunken instruction, that @@ -6122,6 +6521,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { BasicBlock *StartBlock = SI->getParent(); BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency()); // Delete the unconditional branch that was just created by the split. StartBlock->getTerminator()->eraseFromParent(); @@ -6188,7 +6588,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { TT = TrueBlock; FT = FalseBlock; } - IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI); + IRBuilder<> IB(SI); + auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); + IB.CreateCondBr(CondFr, TT, FT, SI); SmallPtrSet<const Instruction *, 2> INS; INS.insert(ASI.begin(), ASI.end()); @@ -6215,79 +6617,54 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) { return true; } -static bool isBroadcastShuffle(ShuffleVectorInst *SVI) { - SmallVector<int, 16> Mask(SVI->getShuffleMask()); - int SplatElem = -1; - for (unsigned i = 0; i < Mask.size(); ++i) { - if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem) - return false; - SplatElem = Mask[i]; - } - - return true; -} - -/// Some targets have expensive vector shifts if the lanes aren't all the same -/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases -/// it's often worth sinking a shufflevector splat down to its use so that -/// codegen can spot all lanes are identical. +/// Some targets only accept certain types for splat inputs. For example a VDUP +/// in MVE takes a GPR (integer) register, and the instruction that incorporate +/// a VDUP (such as a VADD qd, qm, rm) also require a gpr register. bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) { - BasicBlock *DefBB = SVI->getParent(); - - // Only do this xform if variable vector shifts are particularly expensive. - if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType())) + if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()), + m_Undef(), m_ZeroMask()))) return false; - - // We only expect better codegen by sinking a shuffle if we can recognise a - // constant splat. - if (!isBroadcastShuffle(SVI)) + Type *NewType = TLI->shouldConvertSplatType(SVI); + if (!NewType) return false; - // InsertedShuffles - Only insert a shuffle in each block once. 
- DenseMap<BasicBlock*, Instruction*> InsertedShuffles; - - bool MadeChange = false; - for (User *U : SVI->users()) { - Instruction *UI = cast<Instruction>(U); - - // Figure out which BB this ext is used in. - BasicBlock *UserBB = UI->getParent(); - if (UserBB == DefBB) continue; - - // For now only apply this when the splat is used by a shift instruction. - if (!UI->isShift()) continue; - - // Everything checks out, sink the shuffle if the user's block doesn't - // already have a copy. - Instruction *&InsertedShuffle = InsertedShuffles[UserBB]; + auto *SVIVecType = cast<FixedVectorType>(SVI->getType()); + assert(!NewType->isVectorTy() && "Expected a scalar type!"); + assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() && + "Expected a type of the same size!"); + auto *NewVecType = + FixedVectorType::get(NewType, SVIVecType->getNumElements()); + + // Create a bitcast (shuffle (insert (bitcast(..)))) + IRBuilder<> Builder(SVI->getContext()); + Builder.SetInsertPoint(SVI); + Value *BC1 = Builder.CreateBitCast( + cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType); + Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1, + (uint64_t)0); + Value *Shuffle = Builder.CreateShuffleVector( + Insert, UndefValue::get(NewVecType), SVI->getShuffleMask()); + Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType); + + SVI->replaceAllUsesWith(BC2); + RecursivelyDeleteTriviallyDeadInstructions(SVI); + + // Also hoist the bitcast up to its operand if it they are not in the same + // block. + if (auto *BCI = dyn_cast<Instruction>(BC1)) + if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0))) + if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) && + !Op->isTerminator() && !Op->isEHPad()) + BCI->moveAfter(Op); - if (!InsertedShuffle) { - BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt(); - assert(InsertPt != UserBB->end()); - InsertedShuffle = - new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1), - SVI->getOperand(2), "", &*InsertPt); - InsertedShuffle->setDebugLoc(SVI->getDebugLoc()); - } - - UI->replaceUsesOfWith(SVI, InsertedShuffle); - MadeChange = true; - } - - // If we removed all uses, nuke the shuffle. - if (SVI->use_empty()) { - SVI->eraseFromParent(); - MadeChange = true; - } - - return MadeChange; + return true; } bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { // If the operands of I can be folded into a target instruction together with // I, duplicate and sink them. SmallVector<Use *, 4> OpsToSink; - if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink)) + if (!TLI->shouldSinkOperands(I, OpsToSink)) return false; // OpsToSink can contain multiple uses in a use chain (e.g. @@ -6340,9 +6717,6 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) { } bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) { - if (!TLI || !DL) - return false; - Value *Cond = SI->getCondition(); Type *OldType = Cond->getType(); LLVMContext &Context = Cond->getContext(); @@ -6494,6 +6868,8 @@ class VectorPromoteHelper { uint64_t ScalarCost = TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index); uint64_t VectorCost = StoreExtractCombineCost; + enum TargetTransformInfo::TargetCostKind CostKind = + TargetTransformInfo::TCK_RecipThroughput; for (const auto &Inst : InstsToBePromoted) { // Compute the cost. // By construction, all instructions being promoted are arithmetic ones. @@ -6509,8 +6885,9 @@ class VectorPromoteHelper { !IsArg0Constant ? 
TargetTransformInfo::OK_UniformConstantValue : TargetTransformInfo::OK_AnyValue; ScalarCost += TTI.getArithmeticInstrCost( - Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK); + Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK); VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType, + CostKind, Arg0OVK, Arg1OVK); } LLVM_DEBUG( @@ -6539,19 +6916,23 @@ class VectorPromoteHelper { UseSplat = true; } - unsigned End = getTransitionType()->getVectorNumElements(); + ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount(); if (UseSplat) - return ConstantVector::getSplat(End, Val); - - SmallVector<Constant *, 4> ConstVec; - UndefValue *UndefVal = UndefValue::get(Val->getType()); - for (unsigned Idx = 0; Idx != End; ++Idx) { - if (Idx == ExtractIdx) - ConstVec.push_back(Val); - else - ConstVec.push_back(UndefVal); - } - return ConstantVector::get(ConstVec); + return ConstantVector::getSplat(EC, Val); + + if (!EC.Scalable) { + SmallVector<Constant *, 4> ConstVec; + UndefValue *UndefVal = UndefValue::get(Val->getType()); + for (unsigned Idx = 0; Idx != EC.Min; ++Idx) { + if (Idx == ExtractIdx) + ConstVec.push_back(Val); + else + ConstVec.push_back(UndefVal); + } + return ConstantVector::get(ConstVec); + } else + llvm_unreachable( + "Generate scalable vector for non-splat is unimplemented"); } /// Check if promoting to a vector type an operand at \p OperandIdx @@ -6706,7 +7087,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) { /// has this feature and this is profitable. bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) { unsigned CombineCost = std::numeric_limits<unsigned>::max(); - if (DisableStoreExtract || !TLI || + if (DisableStoreExtract || (!StressStoreExtract && !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(), Inst->getOperand(1), CombineCost))) @@ -6793,6 +7174,14 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, const TargetLowering &TLI) { // Handle simple but common cases only. Type *StoreType = SI.getValueOperand()->getType(); + + // The code below assumes shifting a value by <number of bits>, + // whereas scalable vectors would have to be shifted by + // <2log(vscale) + number of bits> in order to store the + // low/high parts. Bailing out for now. + if (isa<ScalableVectorType>(StoreType)) + return false; + if (!DL.typeSizeEqualsStoreSize(StoreType) || DL.getTypeSizeInBits(StoreType) == 0) return false; @@ -6856,12 +7245,19 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL, Value *Addr = Builder.CreateBitCast( SI.getOperand(1), SplitStoreType->getPointerTo(SI.getPointerAddressSpace())); - if ((IsLE && Upper) || (!IsLE && !Upper)) + Align Alignment = SI.getAlign(); + const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper); + if (IsOffsetStore) { Addr = Builder.CreateGEP( SplitStoreType, Addr, ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1)); - Builder.CreateAlignedStore( - V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment()); + + // When splitting the store in half, naturally one half will retain the + // alignment of the original wider store, regardless of whether it was + // over-aligned or not, while the other will require adjustment. 
+ Alignment = commonAlignment(Alignment, HalfValBitSize / 8); + } + Builder.CreateAlignedStore(V, Addr, Alignment); }; CreateSplitStore(LValue, false); @@ -6950,7 +7346,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, return false; ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1)); // Check that GEPI is a cheap one. - if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType()) + if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency) > TargetTransformInfo::TCC_Basic) return false; Value *GEPIOp = GEPI->getOperand(0); @@ -6999,7 +7396,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, cast<ConstantInt>(UGEPI->getOperand(1))->getType()) return false; ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); - if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType()) + if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency) > TargetTransformInfo::TCC_Basic) return false; UGEPIs.push_back(UGEPI); @@ -7010,7 +7408,9 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, for (GetElementPtrInst *UGEPI : UGEPIs) { ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1)); APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue(); - unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType()); + unsigned ImmCost = + TTI->getIntImmCost(NewIdx, GEPIIdx->getType(), + TargetTransformInfo::TCK_SizeAndLatency); if (ImmCost > TargetTransformInfo::TCC_Basic) return false; } @@ -7067,16 +7467,15 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { if (isa<Constant>(CI->getOperand(0))) return false; - if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL)) + if (OptimizeNoopCopyExpression(CI, *TLI, *DL)) return true; if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { /// Sink a zext or sext into its user blocks if the target type doesn't /// fit in one register - if (TLI && - TLI->getTypeAction(CI->getContext(), + if (TLI->getTypeAction(CI->getContext(), TLI->getValueType(*DL, CI->getType())) == - TargetLowering::TypeExpandInteger) { + TargetLowering::TypeExpandInteger) { return SinkCast(CI); } else { bool MadeChange = optimizeExt(I); @@ -7087,30 +7486,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { } if (auto *Cmp = dyn_cast<CmpInst>(I)) - if (TLI && optimizeCmp(Cmp, ModifiedDT)) + if (optimizeCmp(Cmp, ModifiedDT)) return true; if (LoadInst *LI = dyn_cast<LoadInst>(I)) { LI->setMetadata(LLVMContext::MD_invariant_group, nullptr); - if (TLI) { - bool Modified = optimizeLoadExt(LI); - unsigned AS = LI->getPointerAddressSpace(); - Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); - return Modified; - } - return false; + bool Modified = optimizeLoadExt(LI); + unsigned AS = LI->getPointerAddressSpace(); + Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS); + return Modified; } if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (TLI && splitMergedValStore(*SI, *DL, *TLI)) + if (splitMergedValStore(*SI, *DL, *TLI)) return true; SI->setMetadata(LLVMContext::MD_invariant_group, nullptr); - if (TLI) { - unsigned AS = SI->getPointerAddressSpace(); - return optimizeMemoryInst(I, SI->getOperand(1), - SI->getOperand(0)->getType(), AS); - } - return false; + unsigned AS = SI->getPointerAddressSpace(); + return optimizeMemoryInst(I, SI->getOperand(1), + SI->getOperand(0)->getType(), AS); } if (AtomicRMWInst *RMW = 
dyn_cast<AtomicRMWInst>(I)) { @@ -7127,15 +7520,14 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I); - if (BinOp && (BinOp->getOpcode() == Instruction::And) && - EnableAndCmpSinking && TLI) + if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking) return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts); // TODO: Move this into the switch on opcode - it handles shifts already. if (BinOp && (BinOp->getOpcode() == Instruction::AShr || BinOp->getOpcode() == Instruction::LShr)) { ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)); - if (TLI && CI && TLI->hasExtractBitsInsn()) + if (CI && TLI->hasExtractBitsInsn()) if (OptimizeExtractBits(BinOp, CI, *TLI, *DL)) return true; } @@ -7158,6 +7550,35 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { return false; } + if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) { + // freeze(icmp a, const)) -> icmp (freeze a), const + // This helps generate efficient conditional jumps. + Instruction *CmpI = nullptr; + if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0))) + CmpI = II; + else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0))) + CmpI = F->getFastMathFlags().none() ? F : nullptr; + + if (CmpI && CmpI->hasOneUse()) { + auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1); + bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) || + isa<ConstantPointerNull>(Op0); + bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) || + isa<ConstantPointerNull>(Op1); + if (Const0 || Const1) { + if (!Const0 || !Const1) { + auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI); + F->takeName(FI); + CmpI->setOperand(Const0 ? 1 : 0, F); + } + FI->replaceAllUsesWith(CmpI); + FI->eraseFromParent(); + return true; + } + } + return false; + } + if (tryToSinkFreeOperands(I)) return true; @@ -7214,7 +7635,7 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { } bool MadeBitReverse = true; - while (TLI && MadeBitReverse) { + while (MadeBitReverse) { MadeBitReverse = false; for (auto &I : reverse(BB)) { if (makeBitReverse(I, *DL, *TLI)) { @@ -7326,7 +7747,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) { /// FIXME: Remove the (equivalent?) implementation in SelectionDAG. /// bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { - if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive()) + if (!TM->Options.EnableFastISel || TLI->isJumpExpensive()) return false; bool MadeChange = false; @@ -7367,7 +7788,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) { LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump()); // Create a new BB. - auto TmpBB = + auto *TmpBB = BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split", BB.getParent(), BB.getNextNode()); |
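For illustration, the freeze(icmp) fold added to optimizeInst above rewrites IR along the following lines when the compare has a single use and one constant operand; this is only a minimal sketch, and the value names and labels are made up for the example:

    ; before: the freeze sits on the compare result
    %cmp = icmp eq i32 %x, 42
    %cmp.fr = freeze i1 %cmp
    br i1 %cmp.fr, label %then, label %else

    ; after: the freeze is pushed onto the non-constant operand, so the
    ; branch consumes the icmp directly and can be selected as a single
    ; compare-and-jump sequence
    %x.fr = freeze i32 %x
    %cmp = icmp eq i32 %x.fr, 42
    br i1 %cmp, label %then, label %else

In both forms the branch condition is a well-defined (non-poison) value even if %x is poison or undef, which is what allows the pass to move the freeze onto the operand; doing so is what the code comment means by "This helps generate efficient conditional jumps."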