path: root/llvm/lib/CodeGen/CodeGenPrepare.cpp
author     Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-07-26 19:36:28 +0000
commit     cfca06d7963fa0909f90483b42a6d7d194d01e08 (patch)
tree       209fb2a2d68f8f277793fc8df46c753d31bc853b /llvm/lib/CodeGen/CodeGenPrepare.cpp
parent     706b4fc47bbc608932d3b491ae19a3b9cde9497b (diff)
Diffstat (limited to 'llvm/lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r--   llvm/lib/CodeGen/CodeGenPrepare.cpp   915
1 file changed, 668 insertions(+), 247 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f05afd058746..e8b8e6c93cf0 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -43,7 +43,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -61,7 +60,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -178,6 +176,17 @@ static cl::opt<bool> ProfileGuidedSectionPrefix(
"profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("Use profile info to add section prefix for hot/cold functions"));
+static cl::opt<bool> ProfileUnknownInSpecialSection(
+ "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
+ cl::ZeroOrMore,
+ cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
+ "profile, we cannot tell the function is cold for sure because "
+ "it may be a function newly added without ever being sampled. "
+ "With the flag enabled, compiler can put such profile unknown "
+ "functions into a special section, so runtime system can choose "
+ "to handle it in a different way than .text section, to save "
+ "RAM for example. "));
+
static cl::opt<unsigned> FreqRatioToSkipMerge(
"cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
cl::desc("Skip merging empty blocks if (frequency of empty block) / "
@@ -230,6 +239,15 @@ static cl::opt<bool> EnableICMP_EQToICMP_ST(
"cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
+static cl::opt<bool>
+ VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
+ cl::desc("Enable BFI update verification for "
+ "CodeGenPrepare."));
+
+static cl::opt<bool> OptimizePhiTypes(
+ "cgp-optimize-phi-types", cl::Hidden, cl::init(false),
+ cl::desc("Enable converting phi types in CodeGenPrepare"));
+
namespace {
enum ExtType {
@@ -327,6 +345,7 @@ class TypePromotionTransaction;
// FIXME: When we can selectively preserve passes, preserve the domtree.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
}
@@ -368,12 +387,14 @@ class TypePromotionTransaction;
bool optimizeInst(Instruction *I, bool &ModifiedDT);
bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace);
+ bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
bool optimizeInlineAsmInst(CallInst *CS);
bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
bool optimizeShiftInst(BinaryOperator *BO);
+ bool optimizeFunnelShift(IntrinsicInst *Fsh);
bool optimizeSelectInst(SelectInst *SI);
bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
bool optimizeSwitchInst(SwitchInst *SI);
@@ -389,20 +410,25 @@ class TypePromotionTransaction;
unsigned CreatedInstsCost = 0);
bool mergeSExts(Function &F);
bool splitLargeGEPOffsets();
+ bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs);
+ bool optimizePhiTypes(Function &F);
bool performAddressTypePromotion(
Instruction *&Inst,
bool AllowPromotionWithoutCommonHeader,
bool HasPromoted, TypePromotionTransaction &TPT,
SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
bool splitBranchCondition(Function &F, bool &ModifiedDT);
- bool simplifyOffsetableRelocate(Instruction &I);
+ bool simplifyOffsetableRelocate(GCStatepointInst &I);
bool tryToSinkFreeOperands(Instruction *I);
- bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
+ bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
+ Value *Arg1, CmpInst *Cmp,
Intrinsic::ID IID);
bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+ void verifyBFIUpdates(Function &F);
};
} // end anonymous namespace
@@ -428,12 +454,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
InsertedInsts.clear();
PromotedInsts.clear();
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
- TM = &TPC->getTM<TargetMachine>();
- SubtargetInfo = TM->getSubtargetImpl(F);
- TLI = SubtargetInfo->getTargetLowering();
- TRI = SubtargetInfo->getRegisterInfo();
- }
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ SubtargetInfo = TM->getSubtargetImpl(F);
+ TLI = SubtargetInfo->getTargetLowering();
+ TRI = SubtargetInfo->getRegisterInfo();
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -446,14 +470,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
F.setSectionPrefix(".unlikely");
+ else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
+ PSI->isFunctionHotnessUnknown(F))
+ F.setSectionPrefix(".unknown");
}
/// This optimization identifies DIV instructions that can be
/// profitably bypassed and carried out with a shorter, faster divide.
- if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
- TLI->isSlowDivBypassed()) {
+ if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
const DenseMap<unsigned int, unsigned int> &BypassWidths =
- TLI->getBypassSlowDivWidths();
+ TLI->getBypassSlowDivWidths();
BasicBlock* BB = &*F.begin();
while (BB != nullptr) {
// bypassSlowDivision may create new BBs, but we don't want to reapply the
@@ -495,6 +521,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
MadeChange |= mergeSExts(F);
if (!LargeOffsetGEPMap.empty())
MadeChange |= splitLargeGEPOffsets();
+ MadeChange |= optimizePhiTypes(F);
+
+ if (MadeChange)
+ eliminateFallThrough(F);
// Really free removed instructions during promotion.
for (Instruction *I : RemovedInsts)
@@ -550,11 +580,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
}
if (!DisableGCOpts) {
- SmallVector<Instruction *, 2> Statepoints;
+ SmallVector<GCStatepointInst *, 2> Statepoints;
for (BasicBlock &BB : F)
for (Instruction &I : BB)
- if (isStatepoint(I))
- Statepoints.push_back(&I);
+ if (auto *SP = dyn_cast<GCStatepointInst>(&I))
+ Statepoints.push_back(SP);
for (auto &I : Statepoints)
EverMadeChange |= simplifyOffsetableRelocate(*I);
}
@@ -563,9 +593,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// preparatory transforms.
EverMadeChange |= placeDbgValues(F);
+#ifndef NDEBUG
+ if (VerifyBFIUpdates)
+ verifyBFIUpdates(F);
+#endif
+
return EverMadeChange;
}
+// Verify BFI has been updated correctly by recomputing BFI and comparing them.
+void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
+ DominatorTree NewDT(F);
+ LoopInfo NewLI(NewDT);
+ BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
+ BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
+ NewBFI.verifyMatch(*BFI);
+}
+
/// Merge basic blocks which are connected by a single edge, where one of the
/// basic blocks has a single successor pointing to the other basic block,
/// which has a single predecessor.
@@ -749,7 +793,7 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
BlockFrequency BBFreq = BFI->getBlockFreq(BB);
- for (auto SameValueBB : SameIncomingValueBBs)
+ for (auto *SameValueBB : SameIncomingValueBBs)
if (SameValueBB->getUniquePredecessor() == Pred &&
DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
BBFreq += BFI->getBlockFreq(SameValueBB);
@@ -925,7 +969,7 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
SmallVectorImpl<Value *> &OffsetV) {
for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
// Only accept small constant integer operands
- auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
if (!Op || Op->getZExtValue() > 20)
return false;
}
@@ -949,7 +993,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
// be skipped by optimization and we do not care about them.
for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
&*R != RelocatedBase; ++R)
- if (auto RI = dyn_cast<GCRelocateInst>(R))
+ if (auto *RI = dyn_cast<GCRelocateInst>(R))
if (RI->getStatepoint() == RelocatedBase->getStatepoint())
if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
RelocatedBase->moveBefore(RI);
@@ -973,7 +1017,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
}
Value *Base = ToReplace->getBasePtr();
- auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
+ auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
if (!Derived || Derived->getPointerOperand() != Base)
continue;
@@ -1050,10 +1094,9 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
// %base' = gc.relocate(%tok, i32 4, i32 4)
// %ptr' = gep %base' + 15
// %val = load %ptr'
-bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
bool MadeChange = false;
SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
-
for (auto *U : I.users())
if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
// Collect all the relocate calls associated with a statepoint
@@ -1187,6 +1230,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
}
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
+ Value *Arg0, Value *Arg1,
CmpInst *Cmp,
Intrinsic::ID IID) {
if (BO->getParent() != Cmp->getParent()) {
@@ -1204,8 +1248,6 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
}
// We allow matching the canonical IR (add X, C) back to (usubo X, -C).
- Value *Arg0 = BO->getOperand(0);
- Value *Arg1 = BO->getOperand(1);
if (BO->getOpcode() == Instruction::Add &&
IID == Intrinsic::usub_with_overflow) {
assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
@@ -1215,7 +1257,9 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
// Insert at the first instruction of the pair.
Instruction *InsertPt = nullptr;
for (Instruction &Iter : *Cmp->getParent()) {
- if (&Iter == BO || &Iter == Cmp) {
+ // If BO is an XOR, it is not guaranteed that it comes after both inputs to
+ // the overflow intrinsic are defined.
+ if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
InsertPt = &Iter;
break;
}
@@ -1224,12 +1268,16 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
IRBuilder<> Builder(InsertPt);
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
- Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
+ if (BO->getOpcode() != Instruction::Xor) {
+ Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
+ BO->replaceAllUsesWith(Math);
+ } else
+ assert(BO->hasOneUse() &&
+ "Patterns with XOr should use the BO only in the compare");
Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
- BO->replaceAllUsesWith(Math);
Cmp->replaceAllUsesWith(OV);
- BO->eraseFromParent();
Cmp->eraseFromParent();
+ BO->eraseFromParent();
return true;
}
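
(For context: the shape of the rewrite this helper performs, sketched as illustrative before/after IR. The function name, operand names and the extra store are made up, and the transform only runs when TLI->shouldFormOverflowOp approves it for the target.)

  ; before: the canonical overflow-check pattern
  define i1 @will_overflow(i32 %a, i32 %b, i32* %p) {
    %add = add i32 %a, %b
    store i32 %add, i32* %p
    %cmp = icmp ult i32 %add, %a
    ret i1 %cmp
  }

  ; after: both the sum and the overflow bit come from one intrinsic
  declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32)
  define i1 @will_overflow(i32 %a, i32 %b, i32* %p) {
    %m    = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
    %math = extractvalue { i32, i1 } %m, 0
    %ov   = extractvalue { i32, i1 } %m, 1
    store i32 %math, i32* %p
    ret i1 %ov
  }
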
@@ -1269,12 +1317,17 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
bool &ModifiedDT) {
Value *A, *B;
BinaryOperator *Add;
- if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
+ if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
return false;
+ // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases.
+ A = Add->getOperand(0);
+ B = Add->getOperand(1);
+ }
if (!TLI->shouldFormOverflowOp(ISD::UADDO,
- TLI->getValueType(*DL, Add->getType())))
+ TLI->getValueType(*DL, Add->getType()),
+ Add->hasNUsesOrMore(2)))
return false;
// We don't want to move around uses of condition values this late, so we
@@ -1283,7 +1336,8 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
return false;
- if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
+ if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
+ Intrinsic::uadd_with_overflow))
return false;
// Reset callers - do not crash by iterating over a dead instruction.
@@ -1341,10 +1395,12 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return false;
if (!TLI->shouldFormOverflowOp(ISD::USUBO,
- TLI->getValueType(*DL, Sub->getType())))
+ TLI->getValueType(*DL, Sub->getType()),
+ Sub->hasNUsesOrMore(2)))
return false;
- if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
+ if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
+ Cmp, Intrinsic::usub_with_overflow))
return false;
// Reset callers - do not crash by iterating over a dead instruction.
@@ -1813,9 +1869,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
const TargetLowering *TLI,
const DataLayout *DL,
bool &ModifiedDT) {
- if (!TLI || !DL)
- return false;
-
// If a zero input is undefined, it doesn't make sense to despeculate that.
if (match(CountZeros->getOperand(1), m_One()))
return false;
@@ -1877,7 +1930,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// Lower inline assembly if we can.
// If we found an inline asm expession, and if the target knows how to
// lower it to normal LLVM code, do so now.
- if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
+ if (CI->isInlineAsm()) {
if (TLI->ExpandInlineAsm(CI)) {
// Avoid invalidating the iterator.
CurInstIterator = BB->begin();
@@ -1894,7 +1947,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// Align the pointer arguments to this call if the target thinks it's a good
// idea
unsigned MinSize, PrefAlign;
- if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+ if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
for (auto &Arg : CI->arg_operands()) {
// We want to align both objects whose address is used directly and
// objects whose address is used in casts and GEPs, though it only makes
@@ -1912,7 +1965,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
AllocaInst *AI;
if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
- AI->setAlignment(MaybeAlign(PrefAlign));
+ AI->setAlignment(Align(PrefAlign));
// Global variables can only be aligned if they are defined in this
// object (i.e. they are uniquely initialized in this object), and
// over-aligning global variables that have an explicit section is
@@ -1927,12 +1980,14 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// If this is a memcpy (or similar) then we may be able to improve the
// alignment
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
- unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL);
- if (DestAlign > MI->getDestAlignment())
+ Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+ MaybeAlign MIDestAlign = MI->getDestAlign();
+ if (!MIDestAlign || DestAlign > *MIDestAlign)
MI->setDestAlignment(DestAlign);
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
- if (SrcAlign > MTI->getSourceAlignment())
+ MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+ Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+ if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
MTI->setSourceAlignment(SrcAlign);
}
}
@@ -1942,8 +1997,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// cold block. This interacts with our handling for loads and stores to
// ensure that we can fold all uses of a potential addressing computation
// into their uses. TODO: generalize this to work over profiling data
- bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get());
- if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
+ if (CI->hasFnAttr(Attribute::Cold) &&
+ !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
for (auto &Arg : CI->arg_operands()) {
if (!Arg->getType()->isPointerTy())
continue;
@@ -1955,10 +2010,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
if (II) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::assume: {
+ II->eraseFromParent();
+ return true;
+ }
+
case Intrinsic::experimental_widenable_condition: {
// Give up on future widening oppurtunties so that we can fold away dead
// paths and merge blocks before going into block-local instruction
- // selection.
+ // selection.
if (II->use_empty()) {
II->eraseFromParent();
return true;
@@ -2008,21 +2068,43 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
case Intrinsic::ctlz:
// If counting zeros is expensive, try to avoid it.
return despeculateCountZeros(II, TLI, DL, ModifiedDT);
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ return optimizeFunnelShift(II);
case Intrinsic::dbg_value:
return fixupDbgValue(II);
+ case Intrinsic::vscale: {
+ // If datalayout has no special restrictions on vector data layout,
+ // replace `llvm.vscale` by an equivalent constant expression
+ // to benefit from cheap constant propagation.
+ Type *ScalableVectorTy =
+ VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
+ if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
+ auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
+ auto *One = ConstantInt::getSigned(II->getType(), 1);
+ auto *CGep =
+ ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
+ II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
+ II->eraseFromParent();
+ return true;
+ }
+ break;
}
-
- if (TLI) {
- SmallVector<Value*, 2> PtrOps;
- Type *AccessTy;
- if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
- while (!PtrOps.empty()) {
- Value *PtrVal = PtrOps.pop_back_val();
- unsigned AS = PtrVal->getType()->getPointerAddressSpace();
- if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
- return true;
- }
+ case Intrinsic::masked_gather:
+ return optimizeGatherScatterInst(II, II->getArgOperand(0));
+ case Intrinsic::masked_scatter:
+ return optimizeGatherScatterInst(II, II->getArgOperand(1));
}
+
+ SmallVector<Value *, 2> PtrOps;
+ Type *AccessTy;
+ if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty()) {
+ Value *PtrVal = PtrOps.pop_back_val();
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
+ return true;
+ }
}
// From here on out we're working with named functions.
@@ -2033,7 +2115,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// to fortified library functions (e.g. __memcpy_chk) that have the default
// "don't know" as the objectsize. Anything else should be left alone.
FortifiedLibCallSimplifier Simplifier(TLInfo, true);
- if (Value *V = Simplifier.optimizeCall(CI)) {
+ IRBuilder<> Builder(CI);
+ if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
CI->replaceAllUsesWith(V);
CI->eraseFromParent();
return true;
@@ -2073,14 +2156,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
/// ret i32 %tmp2
/// @endcode
bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
- if (!TLI)
- return false;
-
ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
if (!RetI)
return false;
PHINode *PN = nullptr;
+ ExtractValueInst *EVI = nullptr;
BitCastInst *BCI = nullptr;
Value *V = RetI->getReturnValue();
if (V) {
@@ -2088,6 +2169,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
if (BCI)
V = BCI->getOperand(0);
+ EVI = dyn_cast<ExtractValueInst>(V);
+ if (EVI) {
+ V = EVI->getOperand(0);
+ if (!std::all_of(EVI->idx_begin(), EVI->idx_end(),
+ [](unsigned idx) { return idx == 0; }))
+ return false;
+ }
+
PN = dyn_cast<PHINode>(V);
if (!PN)
return false;
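
(Illustrative IR for the shape the new ExtractValueInst handling looks through: a return of a zero-index extractvalue of a phi of call results. The callees and types here are hypothetical, and whether the calls are actually emitted as tail calls still depends on the target checks later in this function.)

  define i32 @caller(i1 %c) {
  entry:
    br i1 %c, label %a, label %b
  a:
    %ra = tail call { i32, i32 } @f()
    br label %end
  b:
    %rb = tail call { i32, i32 } @g()
    br label %end
  end:
    %phi = phi { i32, i32 } [ %ra, %a ], [ %rb, %b ]
    %ev  = extractvalue { i32, i32 } %phi, 0
    ret i32 %ev        ; previously the extractvalue blocked the ret duplication
  }
  declare { i32, i32 } @f()
  declare { i32, i32 } @g()
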
@@ -2101,7 +2190,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
if (PN) {
BasicBlock::iterator BI = BB->begin();
// Skip over debug and the bitcast.
- do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI);
+ do {
+ ++BI;
+ } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI);
if (&*BI != RetI)
return false;
} else {
@@ -2157,6 +2248,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
// Duplicate the return into TailCallBB.
(void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
+ assert(!VerifyBFIUpdates ||
+ BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
+ BFI->setBlockFreq(
+ BB,
+ (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
ModifiedDT = Changed = true;
++NumRetsDup;
}
@@ -2354,6 +2450,9 @@ namespace {
/// This class provides transaction based operation on the IR.
/// Every change made through this class is recorded in the internal state and
/// can be undone (rollback) until commit is called.
+/// CGP does not check if instructions could be speculatively executed when
+/// moved. Preserving the original location would pessimize the debugging
+/// experience, as well as negatively impact the quality of sample PGO.
class TypePromotionTransaction {
/// This represents the common interface of the individual transaction.
/// Each class implements the logic for doing one specific modification on
@@ -2516,6 +2615,7 @@ class TypePromotionTransaction {
/// trunc Opnd to Ty.
TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
IRBuilder<> Builder(Opnd);
+ Builder.SetCurrentDebugLocation(DebugLoc());
Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
}
@@ -2568,6 +2668,7 @@ class TypePromotionTransaction {
ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
: TypePromotionAction(InsertPt) {
IRBuilder<> Builder(InsertPt);
+ Builder.SetCurrentDebugLocation(DebugLoc());
Val = Builder.CreateZExt(Opnd, Ty, "promoted");
LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
}
@@ -2721,8 +2822,9 @@ public:
TypePromotionTransaction(SetOfInstrs &RemovedInsts)
: RemovedInsts(RemovedInsts) {}
- /// Advocate every changes made in that transaction.
- void commit();
+ /// Advocate every changes made in that transaction. Return true if any change
+ /// happen.
+ bool commit();
/// Undo all the changes made after the given point.
void rollback(ConstRestorationPt Point);
@@ -2828,11 +2930,13 @@ TypePromotionTransaction::getRestorationPoint() const {
return !Actions.empty() ? Actions.back().get() : nullptr;
}
-void TypePromotionTransaction::commit() {
+bool TypePromotionTransaction::commit() {
for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
++It)
(*It)->commit();
+ bool Modified = !Actions.empty();
Actions.clear();
+ return Modified;
}
void TypePromotionTransaction::rollback(
@@ -3115,7 +3219,7 @@ public:
SmallPtrSet<Value *, 32> Visited;
WorkList.push_back(Val);
while (!WorkList.empty()) {
- auto P = WorkList.pop_back_val();
+ auto *P = WorkList.pop_back_val();
if (!Visited.insert(P).second)
continue;
if (auto *PI = dyn_cast<Instruction>(P))
@@ -3164,13 +3268,13 @@ public:
void destroyNewNodes(Type *CommonType) {
// For safe erasing, replace the uses with dummy value first.
- auto Dummy = UndefValue::get(CommonType);
- for (auto I : AllPhiNodes) {
+ auto *Dummy = UndefValue::get(CommonType);
+ for (auto *I : AllPhiNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
AllPhiNodes.clear();
- for (auto I : AllSelectNodes) {
+ for (auto *I : AllSelectNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
@@ -3511,7 +3615,7 @@ private:
// Must be a Phi node then.
auto *PHI = cast<PHINode>(V);
// Fill the Phi node with values from predecessors.
- for (auto B : predecessors(PHI->getParent())) {
+ for (auto *B : predecessors(PHI->getParent())) {
Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
assert(Map.find(PV) != Map.end() && "No predecessor Value!");
PHI->addIncoming(ST.Get(Map[PV]), B);
@@ -3625,10 +3729,11 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
// X*Scale + C*Scale to addr mode.
ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
if (isa<Instruction>(ScaleReg) && // not a constant expr.
- match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
+ CI->getValue().isSignedIntN(64)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = AddLHS;
- TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+ TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
// If this addressing mode is legal, commit it and remember that we folded
// this instruction.
@@ -3849,7 +3954,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
// We can get through binary operator, if it is legal. In other words, the
// binary operator must have a nuw or nsw flag.
const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+ if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
(IsSExt && BinOp->hasNoSignedWrap())))
return true;
@@ -4251,15 +4356,20 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
ConstantOffset += SL->getElementOffset(Idx);
} else {
- uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
- if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
- const APInt &CVal = CI->getValue();
- if (CVal.getMinSignedBits() <= 64) {
- ConstantOffset += CVal.getSExtValue() * TypeSize;
- continue;
+ TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType());
+ if (TS.isNonZero()) {
+ // The optimisations below currently only work for fixed offsets.
+ if (TS.isScalable())
+ return false;
+ int64_t TypeSize = TS.getFixedSize();
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getMinSignedBits() <= 64) {
+ ConstantOffset += CVal.getSExtValue() * TypeSize;
+ continue;
+ }
}
- }
- if (TypeSize) { // Scales of zero don't do anything.
// We only allow one variable index at the moment.
if (VariableOperand != -1)
return false;
@@ -4422,11 +4532,13 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
- // Fold in immediates if legal for the target.
- AddrMode.BaseOffs += CI->getSExtValue();
- if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
- return true;
- AddrMode.BaseOffs -= CI->getSExtValue();
+ if (CI->getValue().isSignedIntN(64)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ }
} else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
// If this is a global variable, try to fold it into the addressing mode.
if (!AddrMode.BaseGV) {
@@ -4502,8 +4614,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
const TargetRegisterInfo &TRI) {
const Function *F = CI->getFunction();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
- ImmutableCallSite(CI));
+ TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -4581,14 +4692,16 @@ static bool FindAllMemoryUses(
}
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
- // If this is a cold call, we can sink the addressing calculation into
- // the cold path. See optimizeCallInst
- bool OptForSize = OptSize ||
+ if (CI->hasFnAttr(Attribute::Cold)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst
+ bool OptForSize = OptSize ||
llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
- if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
- continue;
+ if (!OptForSize)
+ continue;
+ }
- InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
if (!IA) return true;
// If this is a memory operand, we're cool, otherwise bail out.
@@ -4854,7 +4967,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
TPT.rollback(LastKnownGood);
return false;
}
- TPT.commit();
+ bool Modified = TPT.commit();
// Get the combined AddrMode (or the only AddrMode, if we only had one).
ExtAddrMode AddrMode = AddrModes.getAddrMode();
@@ -4868,7 +4981,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
})) {
LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
<< "\n");
- return false;
+ return Modified;
}
// Insert this computation right after this user. Since our caller is
@@ -4891,7 +5004,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
- TM && SubtargetInfo->addrSinkUsingGEPs())) {
+ SubtargetInfo->addrSinkUsingGEPs())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
@@ -4909,7 +5022,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// We can't add more than one pointer together, nor can we scale a
// pointer (both of which seem meaningless).
if (ResultPtr || AddrMode.Scale != 1)
- return false;
+ return Modified;
ResultPtr = AddrMode.ScaledReg;
AddrMode.Scale = 0;
@@ -4926,12 +5039,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *ScaledRegTy = AddrMode.ScaledReg->getType();
if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
cast<IntegerType>(ScaledRegTy)->getBitWidth())
- return false;
+ return Modified;
}
if (AddrMode.BaseGV) {
if (ResultPtr)
- return false;
+ return Modified;
ResultPtr = AddrMode.BaseGV;
}
@@ -4955,7 +5068,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
!AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
SunkAddr = Constant::getNullValue(Addr->getType());
} else if (!ResultPtr) {
- return false;
+ return Modified;
} else {
Type *I8PtrTy =
Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
@@ -5040,7 +5153,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
(AddrMode.BaseGV &&
DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
- return false;
+ return Modified;
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
@@ -5080,7 +5193,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Instruction *I = dyn_cast_or_null<Instruction>(Result);
if (I && (Result != AddrMode.BaseReg))
I->eraseFromParent();
- return false;
+ return Modified;
}
if (AddrMode.Scale != 1)
V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
@@ -5142,6 +5255,119 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
return true;
}
+/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
+/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
+/// only handle a 2 operand GEP in the same basic block or a splat constant
+/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
+/// index.
+///
+/// If the existing GEP has a vector base pointer that is splat, we can look
+/// through the splat to find the scalar pointer. If we can't find a scalar
+/// pointer there's nothing we can do.
+///
+/// If we have a GEP with more than 2 indices where the middle indices are all
+/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
+///
+/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
+/// followed by a GEP with an all zeroes vector index. This will enable
+/// SelectionDAGBuilder to use a the scalar GEP as the uniform base and have a
+/// zero index.
+bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
+ Value *Ptr) {
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || !GEP->hasIndices())
+ return false;
+
+ // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
+ // FIXME: We should support this by sinking the GEP.
+ if (MemoryInst->getParent() != GEP->getParent())
+ return false;
+
+ SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
+
+ bool RewriteGEP = false;
+
+ if (Ops[0]->getType()->isVectorTy()) {
+ Ops[0] = const_cast<Value *>(getSplatValue(Ops[0]));
+ if (!Ops[0])
+ return false;
+ RewriteGEP = true;
+ }
+
+ unsigned FinalIndex = Ops.size() - 1;
+
+ // Ensure all but the last index is 0.
+ // FIXME: This isn't strictly required. All that's required is that they are
+ // all scalars or splats.
+ for (unsigned i = 1; i < FinalIndex; ++i) {
+ auto *C = dyn_cast<Constant>(Ops[i]);
+ if (!C)
+ return false;
+ if (isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
+ auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (!CI || !CI->isZero())
+ return false;
+ // Scalarize the index if needed.
+ Ops[i] = CI;
+ }
+
+ // Try to scalarize the final index.
+ if (Ops[FinalIndex]->getType()->isVectorTy()) {
+ if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) {
+ auto *C = dyn_cast<ConstantInt>(V);
+ // Don't scalarize all zeros vector.
+ if (!C || !C->isZero()) {
+ Ops[FinalIndex] = V;
+ RewriteGEP = true;
+ }
+ }
+ }
+
+ // If we made any changes or the we have extra operands, we need to generate
+ // new instructions.
+ if (!RewriteGEP && Ops.size() == 2)
+ return false;
+
+ unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
+
+ IRBuilder<> Builder(MemoryInst);
+
+ Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+ Value *NewAddr;
+
+ // If the final index isn't a vector, emit a scalar GEP containing all ops
+ // and a vector GEP with all zeroes final index.
+ if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+ NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
+ auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
+ NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
+ } else {
+ Value *Base = Ops[0];
+ Value *Index = Ops[FinalIndex];
+
+ // Create a scalar GEP if there are more than 2 operands.
+ if (Ops.size() != 2) {
+ // Replace the last index with 0.
+ Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
+ Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
+ }
+
+ // Now create the GEP with scalar pointer and vector index.
+ NewAddr = Builder.CreateGEP(Base, Index);
+ }
+
+ MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
+
+ // If we have no uses, recursively delete the value and all dead instructions
+ // using it.
+ if (Ptr->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo);
+
+ return true;
+}
+
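
(A rough before/after IR sketch of the rewrite described in the comment above, with hypothetical types and names; only GEPs whose middle indices are all zero and whose base and final index can be scalarized are handled.)

  ; before: a multi-index GEP feeds the gather, all middle indices are zero
  %gep = getelementptr [256 x float], [256 x float]* %base, i64 0, <4 x i64> %index
  %v   = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep, i32 4, <4 x i1> %mask, <4 x float> undef)

  ; after: a scalar GEP provides the uniform base, followed by a 2-operand vector GEP
  %sbase = getelementptr [256 x float], [256 x float]* %base, i64 0, i64 0
  %vgep  = getelementptr float, float* %sbase, <4 x i64> %index
  %v     = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %vgep, i32 4, <4 x i1> %mask, <4 x float> undef)
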
/// If there are any memory operands, use OptimizeMemoryInst to sink their
/// address computing into the block when possible / profitable.
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
@@ -5150,7 +5376,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
const TargetRegisterInfo *TRI =
TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI->ParseConstraints(*DL, TRI, CS);
+ TLI->ParseConstraints(*DL, TRI, *CS);
unsigned ArgNo = 0;
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -5231,7 +5457,7 @@ bool CodeGenPrepare::tryToPromoteExts(
bool Promoted = false;
// Iterate over all the extensions to try to promote them.
- for (auto I : Exts) {
+ for (auto *I : Exts) {
// Early check if we directly have ext(load).
if (isa<LoadInst>(I->getOperand(0))) {
ProfitablyMovedExts.push_back(I);
@@ -5242,7 +5468,7 @@ bool CodeGenPrepare::tryToPromoteExts(
// this check inside the for loop is to catch the case where an extension
// is directly fed by a load because in such case the extension can be moved
// up without any promotion on its operands.
- if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+ if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
return false;
// Get the action to perform the promotion.
@@ -5292,7 +5518,7 @@ bool CodeGenPrepare::tryToPromoteExts(
SmallVector<Instruction *, 2> NewlyMovedExts;
(void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
bool NewPromoted = false;
- for (auto ExtInst : NewlyMovedExts) {
+ for (auto *ExtInst : NewlyMovedExts) {
Instruction *MovedExt = cast<Instruction>(ExtInst);
Value *ExtOperand = MovedExt->getOperand(0);
// If we have reached to a load, we need this extra profitability check
@@ -5358,9 +5584,9 @@ bool CodeGenPrepare::mergeSExts(Function &F) {
return Changed;
}
-// Spliting large data structures so that the GEPs accessing them can have
+// Splitting large data structures so that the GEPs accessing them can have
// smaller offsets so that they can be sunk to the same blocks as their users.
-// For example, a large struct starting from %base is splitted into two parts
+// For example, a large struct starting from %base is split into two parts
// where the second part starts from %new_base.
//
// Before:
@@ -5421,7 +5647,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
Value *NewBaseGEP = nullptr;
- auto LargeOffsetGEP = LargeOffsetGEPs.begin();
+ auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
GetElementPtrInst *GEP = LargeOffsetGEP->first;
int64_t Offset = LargeOffsetGEP->second;
@@ -5435,7 +5661,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
GEP->getAddressSpace())) {
// We need to create a new base if the offset to the current base is
// too large to fit into the addressing mode. So, a very large struct
- // may be splitted into several parts.
+ // may be split into several parts.
BaseGEP = GEP;
BaseOffset = Offset;
NewBaseGEP = nullptr;
@@ -5506,6 +5732,155 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
return Changed;
}
+bool CodeGenPrepare::optimizePhiType(
+ PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
+ // We are looking for a collection on interconnected phi nodes that together
+ // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
+ // are of the same type. Convert the whole set of nodes to the type of the
+ // bitcast.
+ Type *PhiTy = I->getType();
+ Type *ConvertTy = nullptr;
+ if (Visited.count(I) ||
+ (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
+ return false;
+
+ SmallVector<Instruction *, 4> Worklist;
+ Worklist.push_back(cast<Instruction>(I));
+ SmallPtrSet<PHINode *, 4> PhiNodes;
+ PhiNodes.insert(I);
+ Visited.insert(I);
+ SmallPtrSet<Instruction *, 4> Defs;
+ SmallPtrSet<Instruction *, 4> Uses;
+
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.pop_back_val();
+
+ if (auto *Phi = dyn_cast<PHINode>(II)) {
+ // Handle Defs, which might also be PHI's
+ for (Value *V : Phi->incoming_values()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (Visited.count(OpPhi))
+ return false;
+ PhiNodes.insert(OpPhi);
+ Visited.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
+ if (!Defs.count(OpLoad)) {
+ Defs.insert(OpLoad);
+ Worklist.push_back(OpLoad);
+ }
+ } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
+ if (!Defs.count(OpEx)) {
+ Defs.insert(OpEx);
+ Worklist.push_back(OpEx);
+ }
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getOperand(0)->getType();
+ if (OpBC->getOperand(0)->getType() != ConvertTy)
+ return false;
+ if (!Defs.count(OpBC)) {
+ Defs.insert(OpBC);
+ Worklist.push_back(OpBC);
+ }
+ } else if (!isa<UndefValue>(V))
+ return false;
+ }
+ }
+
+ // Handle uses which might also be phi's
+ for (User *V : II->users()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (Visited.count(OpPhi))
+ return false;
+ PhiNodes.insert(OpPhi);
+ Visited.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
+ if (OpStore->getOperand(0) != II)
+ return false;
+ Uses.insert(OpStore);
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getType();
+ if (OpBC->getType() != ConvertTy)
+ return false;
+ Uses.insert(OpBC);
+ } else
+ return false;
+ }
+ }
+
+ if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to "
+ << *ConvertTy << "\n");
+
+ // Create all the new phi nodes of the new type, and bitcast any loads to the
+ // correct type.
+ ValueToValueMap ValMap;
+ ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy);
+ for (Instruction *D : Defs) {
+ if (isa<BitCastInst>(D))
+ ValMap[D] = D->getOperand(0);
+ else
+ ValMap[D] =
+ new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
+ }
+ for (PHINode *Phi : PhiNodes)
+ ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
+ Phi->getName() + ".tc", Phi);
+ // Pipe together all the PhiNodes.
+ for (PHINode *Phi : PhiNodes) {
+ PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+ for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
+ NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
+ Phi->getIncomingBlock(i));
+ }
+ // And finally pipe up the stores and bitcasts
+ for (Instruction *U : Uses) {
+ if (isa<BitCastInst>(U)) {
+ DeletedInstrs.insert(U);
+ U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
+ } else
+ U->setOperand(0,
+ new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
+ }
+
+ // Save the removed phis to be deleted later.
+ for (PHINode *Phi : PhiNodes)
+ DeletedInstrs.insert(Phi);
+ return true;
+}
+
+bool CodeGenPrepare::optimizePhiTypes(Function &F) {
+ if (!OptimizePhiTypes)
+ return false;
+
+ bool Changed = false;
+ SmallPtrSet<PHINode *, 4> Visited;
+ SmallPtrSet<Instruction *, 4> DeletedInstrs;
+
+ // Attempt to optimize all the phis in the functions to the correct type.
+ for (auto &BB : F)
+ for (auto &Phi : BB.phis())
+ Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
+
+ // Remove any old phi's that have been converted.
+ for (auto *I : DeletedInstrs) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
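
(Schematic IR for what optimizePhiType is after, with made-up names: a floating-point phi fed only by loads/undef and consumed only through a bitcast to i32 is carried as an i32 phi instead. The transform is gated behind -cgp-optimize-phi-types and the target's shouldConvertPhiType hook.)

  ; before
  loop:
    %ph = phi float [ undef, %entry ], [ %ld, %loop ]
    %bc = bitcast float %ph to i32
    store i32 %bc, i32* %dst
    %ld = load float, float* %src
    br i1 %cond, label %loop, label %exit

  ; after
  loop:
    %ph.tc = phi i32 [ undef, %entry ], [ %ld.bc, %loop ]
    store i32 %ph.tc, i32* %dst
    %ld    = load float, float* %src
    %ld.bc = bitcast float %ld to i32
    br i1 %cond, label %loop, label %exit
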
/// Return true, if an ext(load) can be formed from an extension in
/// \p MovedExts.
bool CodeGenPrepare::canFormExtLd(
@@ -5567,11 +5942,6 @@ bool CodeGenPrepare::canFormExtLd(
/// \p Inst[in/out] the extension may be modified during the process if some
/// promotions apply.
bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
- // ExtLoad formation and address type promotion infrastructure requires TLI to
- // be effective.
- if (!TLI)
- return false;
-
bool AllowPromotionWithoutCommonHeader = false;
/// See if it is an interesting sext operations for the address type
/// promotion before trying to promote it, e.g., the ones with the right
@@ -5596,16 +5966,8 @@ bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
assert(LI && ExtFedByLoad && "Expect a valid load and extension");
TPT.commit();
- // Move the extend into the same block as the load
+ // Move the extend into the same block as the load.
ExtFedByLoad->moveAfter(LI);
- // CGP does not check if the zext would be speculatively executed when moved
- // to the same basic block as the load. Preserving its original location
- // would pessimize the debugging experience, as well as negatively impact
- // the quality of sample pgo. We don't want to use "line 0" as that has a
- // size cost in the line-table section and logically the zext can be seen as
- // part of the load. Therefore we conservatively reuse the same debug
- // location for the load and the zext.
- ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
++NumExtsMoved;
Inst = ExtFedByLoad;
return true;
@@ -5633,7 +5995,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
bool Promoted = false;
SmallPtrSet<Instruction *, 1> UnhandledExts;
bool AllSeenFirst = true;
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
DenseMap<Value *, Instruction *>::iterator AlreadySeen =
SeenChainsForSExt.find(HeadOfChain);
@@ -5651,7 +6013,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
TPT.commit();
if (HasPromoted)
Promoted = true;
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
SeenChainsForSExt[HeadOfChain] = nullptr;
ValToSExtendedUses[HeadOfChain].push_back(I);
@@ -5662,7 +6024,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
// This is the first chain visited from the header, keep the current chain
// as unhandled. Defer to promote this until we encounter another SExt
// chain derived from the same header.
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
SeenChainsForSExt[HeadOfChain] = Inst;
}
@@ -5670,7 +6032,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
}
if (!AllSeenFirst && !UnhandledExts.empty())
- for (auto VisitedSExt : UnhandledExts) {
+ for (auto *VisitedSExt : UnhandledExts) {
if (RemovedInsts.count(VisitedSExt))
continue;
TypePromotionTransaction TPT(RemovedInsts);
@@ -5681,7 +6043,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
TPT.commit();
if (HasPromoted)
Promoted = true;
- for (auto I : Chains) {
+ for (auto *I : Chains) {
Value *HeadOfChain = I->getOperand(0);
// Mark this as handled.
SeenChainsForSExt[HeadOfChain] = nullptr;
@@ -5701,7 +6063,7 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
return false;
// Only do this xform if truncating is free.
- if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+ if (!TLI->isTruncateFree(I->getType(), Src->getType()))
return false;
// Only safe to perform the optimization if the source is also defined in
@@ -5947,7 +6309,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
// If it's safe to speculatively execute, then it should not have side
// effects; therefore, it's safe to sink and possibly *not* execute.
return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
- TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
+ TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
+ TargetTransformInfo::TCC_Expensive;
}
/// Returns true if a SelectInst should be turned into an explicit branch.
@@ -6044,13 +6407,47 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
return true;
}
+bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
+ Intrinsic::ID Opcode = Fsh->getIntrinsicID();
+ assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
+ "Expected a funnel shift");
+
+ // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
+ // than general vector shifts, and (3) the shift amount is select-of-splatted
+ // values, hoist the funnel shifts before the select:
+ // fsh Op0, Op1, (select Cond, TVal, FVal) -->
+ // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
+ //
+ // This is inverting a generic IR transform when we know that the cost of a
+ // general vector shift is more than the cost of 2 shift-by-scalars.
+ // We can't do this effectively in SDAG because we may not be able to
+ // determine if the select operands are splats from within a basic block.
+ Type *Ty = Fsh->getType();
+ if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
+ return false;
+ Value *Cond, *TVal, *FVal;
+ if (!match(Fsh->getOperand(2),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return false;
+ if (!isSplatValue(TVal) || !isSplatValue(FVal))
+ return false;
+
+ IRBuilder<> Builder(Fsh);
+ Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
+ Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
+ Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
+ Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
+ Fsh->replaceAllUsesWith(NewSel);
+ Fsh->eraseFromParent();
+ return true;
+}
+
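
(The same transform expressed as illustrative IR, assuming <4 x i32> vectors and splat shift amounts %t.amt and %f.amt; it only applies when the target reports vector shift-by-scalar as cheap.)

  ; before
  %amt = select i1 %cond, <4 x i32> %t.amt, <4 x i32> %f.amt
  %r   = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)

  ; after
  %fsh.t = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %t.amt)
  %fsh.f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %f.amt)
  %r     = select i1 %cond, <4 x i32> %fsh.t, <4 x i32> %fsh.f
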
/// If we have a SelectInst that will likely profit from branch prediction,
/// turn it into a branch.
bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// If branch conversion isn't desirable, exit early.
- if (DisableSelectToBranch ||
- OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) ||
- !TLI)
+ if (DisableSelectToBranch || OptSize ||
+ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))
return false;
// Find all consecutive select instructions that share the same condition.
@@ -6103,7 +6500,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// Into:
// start:
// %cmp = cmp uge i32 %a, %b
- // br i1 %cmp, label %select.true, label %select.false
+ // %cmp.frozen = freeze %cmp
+ // br i1 %cmp.frozen, label %select.true, label %select.false
// select.true:
// br label %select.end
// select.false:
@@ -6111,6 +6509,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// select.end:
// %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
//
+ // %cmp should be frozen, otherwise it may introduce undefined behavior.
// In addition, we may sink instructions that produce %c or %d from
// the entry block into the destination(s) of the new branch.
// If the true or false blocks do not contain a sunken instruction, that
@@ -6122,6 +6521,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
BasicBlock *StartBlock = SI->getParent();
BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+ BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
// Delete the unconditional branch that was just created by the split.
StartBlock->getTerminator()->eraseFromParent();
@@ -6188,7 +6588,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
TT = TrueBlock;
FT = FalseBlock;
}
- IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);
+ IRBuilder<> IB(SI);
+ auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+ IB.CreateCondBr(CondFr, TT, FT, SI);
SmallPtrSet<const Instruction *, 2> INS;
INS.insert(ASI.begin(), ASI.end());
@@ -6215,79 +6617,54 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
return true;
}
-static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
- SmallVector<int, 16> Mask(SVI->getShuffleMask());
- int SplatElem = -1;
- for (unsigned i = 0; i < Mask.size(); ++i) {
- if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
- return false;
- SplatElem = Mask[i];
- }
-
- return true;
-}
-
-/// Some targets have expensive vector shifts if the lanes aren't all the same
-/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
-/// it's often worth sinking a shufflevector splat down to its use so that
-/// codegen can spot all lanes are identical.
+/// Some targets only accept certain types for splat inputs. For example a VDUP
+/// in MVE takes a GPR (integer) register, and the instruction that incorporate
+/// a VDUP (such as a VADD qd, qm, rm) also require a gpr register.
bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
- BasicBlock *DefBB = SVI->getParent();
-
- // Only do this xform if variable vector shifts are particularly expensive.
- if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+ if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
return false;
-
- // We only expect better codegen by sinking a shuffle if we can recognise a
- // constant splat.
- if (!isBroadcastShuffle(SVI))
+ Type *NewType = TLI->shouldConvertSplatType(SVI);
+ if (!NewType)
return false;
- // InsertedShuffles - Only insert a shuffle in each block once.
- DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
-
- bool MadeChange = false;
- for (User *U : SVI->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- // Figure out which BB this ext is used in.
- BasicBlock *UserBB = UI->getParent();
- if (UserBB == DefBB) continue;
-
- // For now only apply this when the splat is used by a shift instruction.
- if (!UI->isShift()) continue;
-
- // Everything checks out, sink the shuffle if the user's block doesn't
- // already have a copy.
- Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+ auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
+ assert(!NewType->isVectorTy() && "Expected a scalar type!");
+ assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
+ "Expected a type of the same size!");
+ auto *NewVecType =
+ FixedVectorType::get(NewType, SVIVecType->getNumElements());
+
+ // Create a bitcast (shuffle (insert (bitcast(..))))
+ IRBuilder<> Builder(SVI->getContext());
+ Builder.SetInsertPoint(SVI);
+ Value *BC1 = Builder.CreateBitCast(
+ cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
+ Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
+ (uint64_t)0);
+ Value *Shuffle = Builder.CreateShuffleVector(
+ Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
+ Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
+
+ SVI->replaceAllUsesWith(BC2);
+ RecursivelyDeleteTriviallyDeadInstructions(SVI);
+
+ // Also hoist the bitcast up to its operand if it they are not in the same
+ // block.
+ if (auto *BCI = dyn_cast<Instruction>(BC1))
+ if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
+ if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
+ !Op->isTerminator() && !Op->isEHPad())
+ BCI->moveAfter(Op);
- if (!InsertedShuffle) {
- BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
- assert(InsertPt != UserBB->end());
- InsertedShuffle =
- new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
- SVI->getOperand(2), "", &*InsertPt);
- InsertedShuffle->setDebugLoc(SVI->getDebugLoc());
- }
-
- UI->replaceUsesOfWith(SVI, InsertedShuffle);
- MadeChange = true;
- }
-
- // If we removed all uses, nuke the shuffle.
- if (SVI->use_empty()) {
- SVI->eraseFromParent();
- MadeChange = true;
- }
-
- return MadeChange;
+ return true;
}
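
(Schematic IR for a target whose shouldConvertSplatType hook requests i32, e.g. the MVE case mentioned above; the value names are illustrative.)

  ; before: a floating-point splat
  %ins   = insertelement <4 x float> undef, float %f, i32 0
  %splat = shufflevector <4 x float> %ins, <4 x float> undef, <4 x i32> zeroinitializer

  ; after: splat in the integer domain, bitcast back at the end
  %f.bc    = bitcast float %f to i32
  %ins.i   = insertelement <4 x i32> undef, i32 %f.bc, i64 0
  %splat.i = shufflevector <4 x i32> %ins.i, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat   = bitcast <4 x i32> %splat.i to <4 x float>
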
bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
// If the operands of I can be folded into a target instruction together with
// I, duplicate and sink them.
SmallVector<Use *, 4> OpsToSink;
- if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
+ if (!TLI->shouldSinkOperands(I, OpsToSink))
return false;
// OpsToSink can contain multiple uses in a use chain (e.g.
@@ -6340,9 +6717,6 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
}
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
- if (!TLI || !DL)
- return false;
-
Value *Cond = SI->getCondition();
Type *OldType = Cond->getType();
LLVMContext &Context = Cond->getContext();
@@ -6494,6 +6868,8 @@ class VectorPromoteHelper {
uint64_t ScalarCost =
TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
uint64_t VectorCost = StoreExtractCombineCost;
+ enum TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_RecipThroughput;
for (const auto &Inst : InstsToBePromoted) {
// Compute the cost.
// By construction, all instructions being promoted are arithmetic ones.
@@ -6509,8 +6885,9 @@ class VectorPromoteHelper {
!IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
: TargetTransformInfo::OK_AnyValue;
ScalarCost += TTI.getArithmeticInstrCost(
- Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+ Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK);
VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+ CostKind,
Arg0OVK, Arg1OVK);
}
LLVM_DEBUG(
@@ -6539,19 +6916,23 @@ class VectorPromoteHelper {
UseSplat = true;
}
- unsigned End = getTransitionType()->getVectorNumElements();
+ ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
if (UseSplat)
- return ConstantVector::getSplat(End, Val);
-
- SmallVector<Constant *, 4> ConstVec;
- UndefValue *UndefVal = UndefValue::get(Val->getType());
- for (unsigned Idx = 0; Idx != End; ++Idx) {
- if (Idx == ExtractIdx)
- ConstVec.push_back(Val);
- else
- ConstVec.push_back(UndefVal);
- }
- return ConstantVector::get(ConstVec);
+ return ConstantVector::getSplat(EC, Val);
+
+ if (!EC.Scalable) {
+ SmallVector<Constant *, 4> ConstVec;
+ UndefValue *UndefVal = UndefValue::get(Val->getType());
+ for (unsigned Idx = 0; Idx != EC.Min; ++Idx) {
+ if (Idx == ExtractIdx)
+ ConstVec.push_back(Val);
+ else
+ ConstVec.push_back(UndefVal);
+ }
+ return ConstantVector::get(ConstVec);
+ } else
+ llvm_unreachable(
+ "Generate scalable vector for non-splat is unimplemented");
}
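The constant built here feeds the promotion of an extract-then-arithmetic chain into a whole-vector operation; only the extracted lane must carry the real value, so the remaining lanes can stay undef, and a full splat is used when the extract index is not a constant. A hypothetical sketch:

  ; scalar form
  %e = extractelement <4 x i32> %v, i32 1
  %r = add i32 %e, 7
  ; promoted form (ExtractIdx == 1), remaining lanes undef
  %v.add = add <4 x i32> %v, <i32 undef, i32 7, i32 undef, i32 undef>
  %r     = extractelement <4 x i32> %v.add, i32 1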
/// Check if promoting to a vector type an operand at \p OperandIdx
@@ -6706,7 +7087,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
/// has this feature and this is profitable.
bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
unsigned CombineCost = std::numeric_limits<unsigned>::max();
- if (DisableStoreExtract || !TLI ||
+ if (DisableStoreExtract ||
(!StressStoreExtract &&
!TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
Inst->getOperand(1), CombineCost)))
@@ -6793,6 +7174,14 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
const TargetLowering &TLI) {
// Handle simple but common cases only.
Type *StoreType = SI.getValueOperand()->getType();
+
+ // The code below assumes shifting a value by <number of bits>,
+ // whereas scalable vectors would have to be shifted by
+  // <log2(vscale) + number of bits> in order to store the
+ // low/high parts. Bailing out for now.
+ if (isa<ScalableVectorType>(StoreType))
+ return false;
+
if (!DL.typeSizeEqualsStoreSize(StoreType) ||
DL.getTypeSizeInBits(StoreType) == 0)
return false;
@@ -6856,12 +7245,19 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
Value *Addr = Builder.CreateBitCast(
SI.getOperand(1),
SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
- if ((IsLE && Upper) || (!IsLE && !Upper))
+ Align Alignment = SI.getAlign();
+ const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
+ if (IsOffsetStore) {
Addr = Builder.CreateGEP(
SplitStoreType, Addr,
ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
- Builder.CreateAlignedStore(
- V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
+
+ // When splitting the store in half, naturally one half will retain the
+ // alignment of the original wider store, regardless of whether it was
+ // over-aligned or not, while the other will require adjustment.
+ Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
+ }
+ Builder.CreateAlignedStore(V, Addr, Alignment);
};
CreateSplitStore(LValue, false);
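For reference, a minimal sketch of the split on a little-endian target, assuming the target reports that two narrower stores beat the bit-merge: an i64 assembled from two i32 halves is stored as two i32 stores, where the store at the base keeps the original alignment and the offset store gets the commonAlignment() adjustment:

  ; before
  %lo64 = zext i32 %lo to i64
  %hi64 = zext i32 %hi to i64
  %shl  = shl i64 %hi64, 32
  %or   = or i64 %lo64, %shl
  store i64 %or, i64* %p, align 8

  ; after
  %p32  = bitcast i64* %p to i32*
  store i32 %lo, i32* %p32, align 8
  %p32h = getelementptr i32, i32* %p32, i32 1
  store i32 %hi, i32* %p32h, align 4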
@@ -6950,7 +7346,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
return false;
ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
// Check that GEPI is a cheap one.
- if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType())
+ if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency)
> TargetTransformInfo::TCC_Basic)
return false;
Value *GEPIOp = GEPI->getOperand(0);
@@ -6999,7 +7396,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
cast<ConstantInt>(UGEPI->getOperand(1))->getType())
return false;
ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
- if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType())
+ if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency)
> TargetTransformInfo::TCC_Basic)
return false;
UGEPIs.push_back(UGEPI);
@@ -7010,7 +7408,9 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
for (GetElementPtrInst *UGEPI : UGEPIs) {
ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
- unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType());
+ unsigned ImmCost =
+ TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
if (ImmCost > TargetTransformInfo::TCC_Basic)
return false;
}
@@ -7067,16 +7467,15 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
if (isa<Constant>(CI->getOperand(0)))
return false;
- if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
+ if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
return true;
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
/// Sink a zext or sext into its user blocks if the target type doesn't
/// fit in one register
- if (TLI &&
- TLI->getTypeAction(CI->getContext(),
+ if (TLI->getTypeAction(CI->getContext(),
TLI->getValueType(*DL, CI->getType())) ==
- TargetLowering::TypeExpandInteger) {
+ TargetLowering::TypeExpandInteger) {
return SinkCast(CI);
} else {
bool MadeChange = optimizeExt(I);
@@ -7087,30 +7486,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
}
if (auto *Cmp = dyn_cast<CmpInst>(I))
- if (TLI && optimizeCmp(Cmp, ModifiedDT))
+ if (optimizeCmp(Cmp, ModifiedDT))
return true;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
- if (TLI) {
- bool Modified = optimizeLoadExt(LI);
- unsigned AS = LI->getPointerAddressSpace();
- Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
- return Modified;
- }
- return false;
+ bool Modified = optimizeLoadExt(LI);
+ unsigned AS = LI->getPointerAddressSpace();
+ Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ return Modified;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+ if (splitMergedValStore(*SI, *DL, *TLI))
return true;
SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
- if (TLI) {
- unsigned AS = SI->getPointerAddressSpace();
- return optimizeMemoryInst(I, SI->getOperand(1),
- SI->getOperand(0)->getType(), AS);
- }
- return false;
+ unsigned AS = SI->getPointerAddressSpace();
+ return optimizeMemoryInst(I, SI->getOperand(1),
+ SI->getOperand(0)->getType(), AS);
}
if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
@@ -7127,15 +7520,14 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
- if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
- EnableAndCmpSinking && TLI)
+ if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (TLI && CI && TLI->hasExtractBitsInsn())
+ if (CI && TLI->hasExtractBitsInsn())
if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
return true;
}
@@ -7158,6 +7550,35 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
return false;
}
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
+    // freeze(icmp a, const) -> icmp (freeze a), const
+ // This helps generate efficient conditional jumps.
+ Instruction *CmpI = nullptr;
+ if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
+ CmpI = II;
+ else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
+ CmpI = F->getFastMathFlags().none() ? F : nullptr;
+
+ if (CmpI && CmpI->hasOneUse()) {
+ auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
+ bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
+ isa<ConstantPointerNull>(Op0);
+ bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
+ isa<ConstantPointerNull>(Op1);
+ if (Const0 || Const1) {
+ if (!Const0 || !Const1) {
+ auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
+ F->takeName(FI);
+ CmpI->setOperand(Const0 ? 1 : 0, F);
+ }
+ FI->replaceAllUsesWith(CmpI);
+ FI->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+ }
+
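A small example of the freeze hoisting above (names illustrative): when the compare has a single use and one constant operand, the freeze of the compare result becomes a freeze of the non-constant operand, so the branch can consume the icmp directly:

  ; before
  %c = icmp eq i32 %x, 0
  %f = freeze i1 %c
  br i1 %f, label %then, label %else

  ; after
  %f = freeze i32 %x
  %c = icmp eq i32 %f, 0
  br i1 %c, label %then, label %else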
if (tryToSinkFreeOperands(I))
return true;
@@ -7214,7 +7635,7 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
}
bool MadeBitReverse = true;
- while (TLI && MadeBitReverse) {
+ while (MadeBitReverse) {
MadeBitReverse = false;
for (auto &I : reverse(BB)) {
if (makeBitReverse(I, *DL, *TLI)) {
@@ -7326,7 +7747,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
- if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
+ if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
return false;
bool MadeChange = false;
@@ -7367,7 +7788,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
// Create a new BB.
- auto TmpBB =
+ auto *TmpBB =
BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
BB.getParent(), BB.getNextNode());