Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 207
1 files changed, 179 insertions, 28 deletions
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 64b41bf9cefa..787f146bdddc 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
@@ -33,6 +34,7 @@ using namespace llvm;
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecLoad, "Number of vector loads formed");
 STATISTIC(NumVecCmp, "Number of vector compares formed");
 STATISTIC(NumVecBO, "Number of vector binops formed");
 STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
@@ -65,6 +67,7 @@ private:
   const TargetTransformInfo &TTI;
   const DominatorTree &DT;
 
+  bool vectorizeLoadInsert(Instruction &I);
   ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
@@ -88,6 +91,138 @@ static void replaceValue(Value &Old, Value &New) {
   New.takeName(&Old);
 }
 
+bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
+  // Match insert into fixed vector of scalar value.
+  // TODO: Handle non-zero insert index.
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  Value *Scalar;
+  if (!Ty || !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
+      !Scalar->hasOneUse())
+    return false;
+
+  // Optionally match an extract from another vector.
+  Value *X;
+  bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
+  if (!HasExtract)
+    X = Scalar;
+
+  // Match source value as load of scalar or vector.
+  // Do not vectorize scalar load (widening) if atomic/volatile or under
+  // asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
+  // or create data races non-existent in the source.
+  auto *Load = dyn_cast<LoadInst>(X);
+  if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
+      Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
+      mustSuppressSpeculation(*Load))
+    return false;
+
+  const DataLayout &DL = I.getModule()->getDataLayout();
+  Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
+  assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
+
+  // If original AS != Load's AS, we can't bitcast the original pointer and have
+  // to use Load's operand instead. Ideally we would want to strip pointer casts
+  // without changing AS, but there's no API to do that ATM.
+  unsigned AS = Load->getPointerAddressSpace();
+  if (AS != SrcPtr->getType()->getPointerAddressSpace())
+    SrcPtr = Load->getPointerOperand();
+
+  // We are potentially transforming byte-sized (8-bit) memory accesses, so make
+  // sure we have all of our type-based constraints in place for this target.
+  Type *ScalarTy = Scalar->getType();
+  uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
+      ScalarSize % 8 != 0)
+    return false;
+
+  // Check safety of replacing the scalar load with a larger vector load.
+  // We use minimal alignment (maximum flexibility) because we only care about
+  // the dereferenceable region. When calculating cost and creating a new op,
+  // we may use a larger value based on alignment attributes.
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+  unsigned OffsetEltIndex = 0;
+  Align Alignment = Load->getAlign();
+  if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT)) {
+    // It is not safe to load directly from the pointer, but we can still peek
+    // through gep offsets and check if it safe to load from a base address with
+    // updated alignment. If it is, we can shuffle the element(s) into place
+    // after loading.
+    unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+    APInt Offset(OffsetBitWidth, 0);
+    SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+
+    // We want to shuffle the result down from a high element of a vector, so
+    // the offset must be positive.
+    if (Offset.isNegative())
+      return false;
+
+    // The offset must be a multiple of the scalar element to shuffle cleanly
+    // in the element's size.
+    uint64_t ScalarSizeInBytes = ScalarSize / 8;
+    if (Offset.urem(ScalarSizeInBytes) != 0)
+      return false;
+
+    // If we load MinVecNumElts, will our target element still be loaded?
+    OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
+    if (OffsetEltIndex >= MinVecNumElts)
+      return false;
+
+    if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &DT))
+      return false;
+
+    // Update alignment with offset value. Note that the offset could be negated
+    // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
+    // negation does not change the result of the alignment calculation.
+    Alignment = commonAlignment(Alignment, Offset.getZExtValue());
+  }
+
+  // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
+  // Use the greater of the alignment on the load or its source pointer.
+  Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+  Type *LoadTy = Load->getType();
+  InstructionCost OldCost =
+      TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+                                          /* Insert */ true, HasExtract);
+
+  // New pattern: load VecPtr
+  InstructionCost NewCost =
+      TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
+  // Optionally, we are shuffling the loaded vector element(s) into place.
+  if (OffsetEltIndex)
+    NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy);
+
+  // We can aggressively convert to the vector form because the backend can
+  // invert this transform if it does not result in a performance win.
+  if (OldCost < NewCost || !NewCost.isValid())
+    return false;
+
+  // It is safe and potentially profitable to load a vector directly:
+  // inselt undef, load Scalar, 0 --> load VecPtr
+  IRBuilder<> Builder(Load);
+  Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // Set everything but element 0 to undef to prevent poison from propagating
+  // from the extra loaded memory. This will also optionally shrink/grow the
+  // vector from the loaded size to the output size.
+  // We assume this operation has no cost in codegen if there was no offset.
+  // Note that we could use freeze to avoid poison problems, but then we might
+  // still need a shuffle to change the vector size.
+  unsigned OutputNumElts = Ty->getNumElements();
+  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
+  Mask[0] = OffsetEltIndex;
+  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+
+  replaceValue(I, *VecLd);
+  ++NumVecLoad;
+  return true;
+}
+
 /// Determine which, if any, of the inputs should be replaced by a shuffle
 /// followed by extract from a different index.
 ExtractElementInst *VectorCombine::getShuffleExtract(
@@ -106,8 +241,14 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
 
   Type *VecTy = Ext0->getVectorOperand()->getType();
   assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
-  int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
-  int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+  InstructionCost Cost0 =
+      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+  InstructionCost Cost1 =
+      TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+
+  // If both costs are invalid no shuffle is needed
+  if (!Cost0.isValid() && !Cost1.isValid())
+    return nullptr;
 
   // We are extracting from 2 different indexes, so one operand must be shuffled
   // before performing a vector operation and/or extract. The more expensive
@@ -143,7 +284,7 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
          "Expected constant extract indexes");
   Type *ScalarTy = Ext0->getType();
   auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
-  int ScalarOpCost, VectorOpCost;
+  InstructionCost ScalarOpCost, VectorOpCost;
 
   // Get cost estimates for scalar and vector versions of the operation.
   bool IsBinOp = Instruction::isBinaryOp(Opcode);
@@ -164,9 +305,9 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
 
   unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
   unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
-  int Extract0Cost =
+  InstructionCost Extract0Cost =
       TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
-  int Extract1Cost =
+  InstructionCost Extract1Cost =
       TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
 
   // A more expensive extract will always be replaced by a splat shuffle.
@@ -176,11 +317,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   // TODO: Evaluate whether that always results in lowest cost. Alternatively,
   // check the cost of creating a broadcast shuffle and shuffling both
   // operands to element 0.
-  int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
+  InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
 
   // Extra uses of the extracts mean that we include those costs in the
   // vector total because those instructions will not be eliminated.
-  int OldCost, NewCost;
+  InstructionCost OldCost, NewCost;
   if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
     // Handle a special case. If the 2 extracts are identical, adjust the
     // formulas to account for that. The extra use charge allows for either the
@@ -231,8 +372,7 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
   auto *VecTy = cast<FixedVectorType>(Vec->getType());
   SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
   ShufMask[NewIndex] = OldIndex;
-  Value *Undef = UndefValue::get(VecTy);
-  return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift");
+  return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
 }
 
 /// Given an extract element instruction with constant index operand, shuffle
@@ -366,17 +506,23 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
                      m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
     return false;
 
-  // Disallow non-vector casts and length-changing shuffles.
+  // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
+  // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
+  // mask for scalable type is a splat or not.
+  // 2) Disallow non-vector casts and length-changing shuffles.
   // TODO: We could allow any shuffle.
-  auto *DestTy = dyn_cast<VectorType>(I.getType());
-  auto *SrcTy = cast<VectorType>(V->getType());
-  if (!DestTy || I.getOperand(0)->getType() != SrcTy)
+  auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
+  auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
+  if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
     return false;
 
   // The new shuffle must not cost more than the old shuffle. The bitcast is
   // moved ahead of the shuffle, so assume that it has the same cost as before.
-  if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+  InstructionCost DestCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
+  InstructionCost SrcCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
+  if (DestCost > SrcCost || !DestCost.isValid())
     return false;
 
   unsigned DestNumElts = DestTy->getNumElements();
@@ -399,8 +545,7 @@ bool VectorCombine::foldBitcastShuf(Instruction &I) {
   // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
   ++NumShufOfBitcast;
   Value *CastV = Builder.CreateBitCast(V, DestTy);
-  Value *Shuf =
-      Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask);
+  Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
   replaceValue(I, *Shuf);
   return true;
 }
@@ -467,7 +612,7 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
          "Unexpected types for insert element into binop or cmp");
 
   unsigned Opcode = I.getOpcode();
-  int ScalarOpCost, VectorOpCost;
+  InstructionCost ScalarOpCost, VectorOpCost;
   if (IsCmp) {
     ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
     VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
@@ -478,16 +623,16 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
-  int InsertCost =
+  InstructionCost InsertCost =
       TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
-  int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) +
-                VectorOpCost;
-  int NewCost = ScalarOpCost + InsertCost +
-                (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
-                (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+  InstructionCost OldCost =
+      (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
+  InstructionCost NewCost = ScalarOpCost + InsertCost +
+                            (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
+                            (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
 
   // We want to scalarize unless the vector variant actually has lower cost.
-  if (OldCost < NewCost)
+  if (OldCost < NewCost || !NewCost.isValid())
     return false;
 
   // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
@@ -567,7 +712,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   if (!VecTy)
     return false;
 
-  int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
   OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
   OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
   OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
@@ -578,7 +724,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
   int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
   auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
-  int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+  InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
   NewCost +=
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
   NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
@@ -587,7 +733,7 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   // Aggressively form vector ops if the cost is equal because the transform
   // may enable further optimization.
   // Codegen can reverse this transform (scalarize) if it was not profitable.
-  if (OldCost < NewCost)
+  if (OldCost < NewCost || !NewCost.isValid())
    return false;
 
   // Create a vector constant from the 2 scalar constants.
@@ -612,6 +758,10 @@ bool VectorCombine::run() {
   if (DisableVectorCombine)
     return false;
 
+  // Don't attempt vectorization if the target does not support vectors.
+  if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
+    return false;
+
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
@@ -625,6 +775,7 @@ bool VectorCombine::run() {
       if (isa<DbgInfoIntrinsic>(I))
        continue;
      Builder.SetInsertPoint(&I);
+      MadeChange |= vectorizeLoadInsert(I);
       MadeChange |= foldExtractExtract(I);
       MadeChange |= foldBitcastShuf(I);
       MadeChange |= scalarizeBinopOrCmp(I);
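The functional centerpiece of this diff is the new VectorCombine::vectorizeLoadInsert(), which widens a scalar load feeding an insertelement at index 0 into a single vector load plus a shuffle, provided the wider access is provably dereferenceable and the TTI cost model does not object. The following is a minimal before/after sketch in LLVM IR, not taken from this change: the function name @load_insert, the types, and the attributes are illustrative, and it assumes a target whose minimum vector register width is 128 bits.

; Before: scalar load feeding an insertelement at element 0. The pointer is
; assumed dereferenceable for a full 16-byte vector, so widening the load is safe.
define <4 x float> @load_insert(float* dereferenceable(16) %p) {
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0
  ret <4 x float> %r
}

; After (sketch): one vector load; the shuffle keeps element 0 and leaves the
; other lanes undef so poison from the extra loaded memory cannot propagate.
define <4 x float> @load_insert(float* dereferenceable(16) %p) {
  %vp = bitcast float* %p to <4 x float>*
  %v = load <4 x float>, <4 x float>* %vp, align 4
  %r = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  ret <4 x float> %r
}

When the scalar pointer is a constant in-bounds offset from a wider dereferenceable base, the load is instead rebased onto the stripped base pointer and the shuffle mask selects OffsetEltIndex rather than element 0.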