Diffstat (limited to 'lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp')
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 112 |
1 file changed, 78 insertions, 34 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 9bf87d024607..e0d85c4b49ae 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1,9 +1,8 @@
 //===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -366,10 +365,9 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     KnownBits InputKnown(SrcBitWidth);
     if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
       return I;
-    Known = InputKnown.zextOrTrunc(BitWidth);
-    // Any top bits are known to be zero.
-    if (BitWidth > SrcBitWidth)
-      Known.Zero.setBitsFrom(SrcBitWidth);
+    assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
+    Known = InputKnown.zextOrTrunc(BitWidth,
+                                   true /* ExtendedBitsAreKnownZero */);
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     break;
   }
@@ -967,26 +965,16 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
 }
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
+///
+/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
+/// struct returns.
 Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
                                                            APInt DemandedElts,
-                                                           int DMaskIdx,
-                                                           int TFCIdx) {
+                                                           int DMaskIdx) {
   unsigned VWidth = II->getType()->getVectorNumElements();
   if (VWidth == 1)
     return nullptr;
 
-  // Need to change to new instruction format
-  ConstantInt *TFC = nullptr;
-  bool TFELWEEnabled = false;
-  if (TFCIdx > 0) {
-    TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx));
-    TFELWEEnabled = TFC->getZExtValue() & 0x1     // TFE
-                    || TFC->getZExtValue() & 0x2; // LWE
-  }
-
-  if (TFELWEEnabled)
-    return nullptr; // TFE not yet supported
-
   ConstantInt *NewDMask = nullptr;
 
   if (DMaskIdx < 0) {
@@ -994,10 +982,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
     // below.
     DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
   } else {
-    ConstantInt *DMask = dyn_cast<ConstantInt>(II->getArgOperand(DMaskIdx));
-    if (!DMask)
-      return nullptr; // non-constant dmask is not supported by codegen
-
+    ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
 
     // Mask off values that are undefined because the dmask doesn't cover them
@@ -1018,8 +1003,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
     NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
   }
 
-  // TODO: Handle 3 vectors when supported in code gen.
-  unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countPopulation());
+  unsigned NewNumElts = DemandedElts.countPopulation();
   if (!NewNumElts)
     return UndefValue::get(II->getType());
 
@@ -1035,13 +1019,12 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
   getIntrinsicInfoTableEntries(IID, Table);
   ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
 
+  // Validate function argument and return types, extracting overloaded types
+  // along the way.
   FunctionType *FTy = II->getCalledFunction()->getFunctionType();
   SmallVector<Type *, 6> OverloadTys;
-  Intrinsic::matchIntrinsicType(FTy->getReturnType(), TableRef, OverloadTys);
-  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
-    Intrinsic::matchIntrinsicType(FTy->getParamType(i), TableRef, OverloadTys);
+  Intrinsic::matchIntrinsicSignature(FTy, TableRef, OverloadTys);
 
-  // Get the new return type overload of the intrinsic.
   Module *M = II->getParent()->getParent()->getParent();
   Type *EltTy = II->getType()->getVectorElementType();
   Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
@@ -1184,6 +1167,39 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
   switch (I->getOpcode()) {
   default: break;
 
+  case Instruction::GetElementPtr: {
+    // The LangRef requires that struct geps have all constant indices. As
+    // such, we can't convert any operand to partial undef.
+    auto mayIndexStructType = [](GetElementPtrInst &GEP) {
+      for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
+           I != E; I++)
+        if (I.isStruct())
+          return true;
+      return false;
+    };
+    if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
+      break;
+
+    // Conservatively track the demanded elements back through any vector
+    // operands we may have. We know there must be at least one, or we
+    // wouldn't have a vector result to get here. Note that we intentionally
+    // merge the undef bits here since gepping with either an undef base or
+    // index results in undef.
+    for (unsigned i = 0; i < I->getNumOperands(); i++) {
+      if (isa<UndefValue>(I->getOperand(i))) {
+        // If the entire vector is undefined, just return this info.
+        UndefElts = EltMask;
+        return nullptr;
+      }
+      if (I->getOperand(i)->getType()->isVectorTy()) {
+        APInt UndefEltsOp(VWidth, 0);
+        simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
+        UndefElts |= UndefEltsOp;
+      }
+    }
+
+    break;
+  }
   case Instruction::InsertElement: {
     // If this is a variable index, we don't know which element it overwrites.
     // demand exactly the same input as we produce.
@@ -1430,6 +1446,30 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II) break;
     switch (II->getIntrinsicID()) {
+    case Intrinsic::masked_gather: // fallthrough
+    case Intrinsic::masked_load: {
+      // Subtlety: If we load from a pointer, the pointer must be valid
+      // regardless of whether the element is demanded. Doing otherwise risks
+      // segfaults which didn't exist in the original program.
+      APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
+            DemandedPassThrough(DemandedElts);
+      if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
+        for (unsigned i = 0; i < VWidth; i++) {
+          Constant *CElt = CV->getAggregateElement(i);
+          if (CElt->isNullValue())
+            DemandedPtrs.clearBit(i);
+          else if (CElt->isAllOnesValue())
+            DemandedPassThrough.clearBit(i);
+        }
+      if (II->getIntrinsicID() == Intrinsic::masked_gather)
+        simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
+      simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
+
+      // Output elements are undefined if the element from both sources are.
+      // TODO: can strengthen via mask as well.
+      UndefElts = UndefElts2 & UndefElts3;
+      break;
+    }
     case Intrinsic::x86_xop_vfrcz_ss:
     case Intrinsic::x86_xop_vfrcz_sd:
       // The instructions for these intrinsics are speced to zero upper bits not
@@ -1639,8 +1679,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
       return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
     default: {
       if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
-        return simplifyAMDGCNMemoryIntrinsicDemanded(
-            II, DemandedElts, 0, II->getNumArgOperands() - 2);
+        return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
 
       break;
     }
@@ -1667,5 +1706,10 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     UndefElts &= UndefElts2;
   }
 
+  // If we've proven all of the lanes undef, return an undef value.
+  // TODO: Intersect w/demanded lanes
+  if (UndefElts.isAllOnesValue())
+    return UndefValue::get(I->getType());
+
   return MadeChange ? I : nullptr;
 }
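
Aside on the ZExt/Trunc hunk: the deleted lines patched the known-zero bits by hand after a plain zextOrTrunc(BitWidth); the replacement leans on the two-argument KnownBits::zextOrTrunc overload called above to mark the newly created high bits as known zero itself. A minimal sketch of the behavior being relied on, assuming the LLVM headers from this revision (the standalone harness is illustrative, not part of the commit):

#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

int main() {
  // An i8 value known to look like 0b0000'11??: the top four bits are known
  // zero, bits 2-3 are known one, bits 0-1 are unknown.
  KnownBits Input(8);
  Input.Zero = APInt(8, 0xF0);
  Input.One = APInt(8, 0x0C);

  // zext to i32. Passing ExtendedBitsAreKnownZero=true records the 24 new
  // high bits as known zero directly, which is what the deleted
  // "Known.Zero.setBitsFrom(SrcBitWidth)" fixup used to patch in afterwards.
  KnownBits Known = Input.zextOrTrunc(32, /*ExtendedBitsAreKnownZero=*/true);
  assert(Known.getBitWidth() == 32);
  assert(Known.Zero.countLeadingOnes() == 28 && "24 new bits + 4 old zeros");
  assert(!Known.hasConflict() && "Bits known to be one AND zero?");
  return 0;
}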
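
Aside on the dmask handling in simplifyAMDGCNMemoryIntrinsicDemanded: each set bit of an image intrinsic's dmask selects one channel and yields one lane of the result vector, so dropping undemanded lanes amounts to clearing the matching dmask bits. With the PowerOf2Ceil call removed above, the rewritten intrinsic returns exactly popcount-of-the-demanded-lanes elements rather than rounding up to a power of two. A self-contained model of that bookkeeping (plain C++; narrowDMask is an invented name, not an LLVM API):

#include <cassert>
#include <cstdint>

// DemandedElts bit i refers to the i-th result lane, i.e. the i-th set bit of
// the original 4-bit dmask, matching how the function above numbers lanes.
unsigned narrowDMask(unsigned DMaskVal, uint64_t DemandedElts,
                     unsigned &NewNumElts) {
  unsigned NewDMask = 0;
  NewNumElts = 0;
  unsigned Lane = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit) {
    unsigned ChannelMask = 1u << Bit;
    if (!(DMaskVal & ChannelMask))
      continue; // channel disabled in the original dmask: no result lane
    if (DemandedElts & (1ull << Lane)) {
      NewDMask |= ChannelMask; // lane demanded: keep this channel
      ++NewNumElts;            // exactly popcount, no PowerOf2Ceil rounding
    }
    ++Lane;
  }
  return NewDMask;
}

int main() {
  // dmask 0b1011 yields 3 lanes; demanding only lanes 0 and 2 keeps channels
  // 0 and 3, so the call would be rewritten from <3 x float> to <2 x float>.
  unsigned NewNumElts = 0;
  unsigned NewDMask = narrowDMask(/*DMaskVal=*/0b1011, /*DemandedElts=*/0b101,
                                  NewNumElts);
  assert(NewDMask == 0b1001 && NewNumElts == 2);
  return 0;
}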
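
Aside on the masked_gather/masked_load case: the demanded lanes of the result are split between the pointer operand and the passthrough operand using any constant mask lanes, with the safety rule from the comment above that a lane's pointer stays demanded unless the mask provably disables it. A standalone model of the split (plain C++, no LLVM dependencies; splitDemanded and DemandedSplit are invented names):

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

// Lane sets as bitmasks; bit i is lane i. Assumes at most 63 lanes.
struct DemandedSplit {
  uint64_t DemandedPtrs;        // lanes whose pointer operand stays demanded
  uint64_t DemandedPassThrough; // lanes whose passthrough stays demanded
};

// Mask lanes: std::nullopt = not constant, false = known off, true = known on.
DemandedSplit splitDemanded(uint64_t DemandedElts,
                            const std::vector<std::optional<bool>> &Mask) {
  DemandedSplit S;
  // Pointers start fully demanded, independent of DemandedElts: dropping a
  // pointer the mask might still enable could introduce a segfault that the
  // original program never had.
  S.DemandedPtrs = (1ull << Mask.size()) - 1;
  S.DemandedPassThrough = DemandedElts;
  for (unsigned i = 0; i < Mask.size(); ++i) {
    if (Mask[i] == false)
      S.DemandedPtrs &= ~(1ull << i);        // lane known off: ptr never read
    else if (Mask[i] == true)
      S.DemandedPassThrough &= ~(1ull << i); // lane known on: passthrough dead
  }
  return S;
}

int main() {
  // 4 lanes, mask <1, 0, 1, ?>, all result lanes demanded.
  DemandedSplit S = splitDemanded(0b1111, {true, false, true, std::nullopt});
  assert(S.DemandedPtrs == 0b1101);        // lane 1's pointer can be dropped
  assert(S.DemandedPassThrough == 0b1010); // lanes 0 and 2 never read it
  return 0;
}

In the hunk itself the same two lane sets feed simplifyAndSetOp for operand 0 (the gather pointers) and operand 3 (the passthrough), and the result's undef lanes are then the intersection of the undef lanes of those two sources.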