Diffstat (limited to 'lib/Transforms/InstCombine/InstCombineCalls.cpp')
-rw-r--r--   lib/Transforms/InstCombine/InstCombineCalls.cpp   836
1 file changed, 488 insertions, 348 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 40e52ee755e5..cbfbd8a53993 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -24,6 +24,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -57,7 +58,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
@@ -73,11 +73,11 @@ using namespace PatternMatch;
STATISTIC(NumSimplified, "Number of library calls simplified");
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
- "unfold-element-atomic-memcpy-max-elements",
- cl::init(16),
- cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
- "allowed to unfold"));
+static cl::opt<unsigned> GuardWideningWindow(
+ "instcombine-guard-widening-window",
+ cl::init(3),
+ cl::desc("How wide an instruction window to bypass looking for "
+ "another guard"));
/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
@@ -106,97 +106,24 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
return ConstantVector::get(BoolVec);
}
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
- // Try to unfold this intrinsic into sequence of explicit atomic loads and
- // stores.
- // First check that number of elements is compile time constant.
- auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
- if (!LengthCI)
- return nullptr;
-
- // Check that there are not too many elements.
- uint64_t LengthInBytes = LengthCI->getZExtValue();
- uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
- uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
- if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
- return nullptr;
-
- // Only expand if there are elements to copy.
- if (NumElements > 0) {
- // Don't unfold into illegal integers
- uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
- if (!getDataLayout().isLegalInteger(ElementSizeInBits))
- return nullptr;
-
- // Cast source and destination to the correct type. Intrinsic input
- // arguments are usually represented as i8*. Often operands will be
- // explicitly casted to i8* and we can just strip those casts instead of
- // inserting new ones. However it's easier to rely on other InstCombine
- // rules which will cover trivial cases anyway.
- Value *Src = AMI->getRawSource();
- Value *Dst = AMI->getRawDest();
- Type *ElementPointerType =
- Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
- Src->getType()->getPointerAddressSpace());
-
- Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
- "memcpy_unfold.src_casted");
- Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
- "memcpy_unfold.dst_casted");
-
- for (uint64_t i = 0; i < NumElements; ++i) {
- // Get current element addresses
- ConstantInt *ElementIdxCI =
- ConstantInt::get(AMI->getContext(), APInt(64, i));
- Value *SrcElementAddr =
- Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
- Value *DstElementAddr =
- Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
- // Load from the source. Transfer alignment information and mark load as
- // unordered atomic.
- LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
- Load->setOrdering(AtomicOrdering::Unordered);
- // We know alignment of the first element. It is also guaranteed by the
- // verifier that element size is less or equal than first element
- // alignment and both of this values are powers of two. This means that
- // all subsequent accesses are at least element size aligned.
- // TODO: We can infer better alignment but there is no evidence that this
- // will matter.
- Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
- : ElementSizeInBytes);
- Load->setDebugLoc(AMI->getDebugLoc());
-
- // Store loaded value via unordered atomic store.
- StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
- Store->setOrdering(AtomicOrdering::Unordered);
- Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
- : ElementSizeInBytes);
- Store->setDebugLoc(AMI->getDebugLoc());
- }
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
+ unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ unsigned CopyDstAlign = MI->getDestAlignment();
+ if (CopyDstAlign < DstAlign) {
+ MI->setDestAlignment(DstAlign);
+ return MI;
}
- // Set the number of elements of the copy to 0, it will be deleted on the
- // next iteration.
- AMI->setLength(Constant::getNullValue(LengthCI->getType()));
- return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
- unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
- unsigned MinAlign = std::min(DstAlign, SrcAlign);
- unsigned CopyAlign = MI->getAlignment();
-
- if (CopyAlign < MinAlign) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
+ unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ unsigned CopySrcAlign = MI->getSourceAlignment();
+ if (CopySrcAlign < SrcAlign) {
+ MI->setSourceAlignment(SrcAlign);
return MI;
}
// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
// load/store.
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
if (!MemOpLength) return nullptr;
// Source and destination pointer types are always "i8*" for intrinsic. See
@@ -222,7 +149,9 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// If the memcpy has metadata describing the members, see if we can get the
// TBAA tag describing our copy.
MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
+ CopyMD = M;
+ } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
if (M->getNumOperands() == 3 && M->getOperand(0) &&
mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
@@ -234,15 +163,11 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
CopyMD = cast<MDNode>(M->getOperand(2));
}
- // If the memcpy/memmove provides better alignment info than we can
- // infer, use it.
- SrcAlign = std::max(SrcAlign, CopyAlign);
- DstAlign = std::max(DstAlign, CopyAlign);
-
Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
- L->setAlignment(SrcAlign);
+ LoadInst *L = Builder.CreateLoad(Src);
+ // Alignment from the mem intrinsic will be better, so use it.
+ L->setAlignment(CopySrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
MDNode *LoopMemParallelMD =
@@ -250,23 +175,34 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
if (LoopMemParallelMD)
L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
- S->setAlignment(DstAlign);
+ StoreInst *S = Builder.CreateStore(L, Dest);
+ // Alignment from the mem intrinsic will be better, so use it.
+ S->setAlignment(CopyDstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
if (LoopMemParallelMD)
S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
// Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
return MI;
}
-Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
+Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- if (MI->getAlignment() < Alignment) {
- MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
- Alignment, false));
+ if (MI->getDestAlignment() < Alignment) {
+ MI->setDestAlignment(Alignment);
return MI;
}
@@ -276,7 +212,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
return nullptr;
uint64_t Len = LenC->getLimitedValue();
- Alignment = MI->getAlignment();
+ Alignment = MI->getDestAlignment();
assert(Len && "0-sized memory setting should be removed already.");
// memset(s,c,n) -> store s, c (for n=1,2,4,8)
@@ -296,6 +232,8 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
MI->isVolatile());
S->setAlignment(Alignment);
+ if (isa<AtomicMemSetInst>(MI))
+ S->setOrdering(AtomicOrdering::Unordered);
// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setLength(Constant::getNullValue(LenC->getType()));
@@ -563,55 +501,6 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
-static Value *simplifyX86muldq(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- Value *Arg0 = II.getArgOperand(0);
- Value *Arg1 = II.getArgOperand(1);
- Type *ResTy = II.getType();
- assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
- Arg1->getType()->getScalarSizeInBits() == 32 &&
- ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");
-
- // muldq/muludq(undef, undef) -> zero (matches generic mul behavior)
- if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
- return ConstantAggregateZero::get(ResTy);
-
- // Constant folding.
- // PMULDQ = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
- // vXi64 sext(shuffle<0,2,..>(Arg1))))
- // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
- // vXi64 zext(shuffle<0,2,..>(Arg1))))
- if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
- return nullptr;
-
- unsigned NumElts = ResTy->getVectorNumElements();
- assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
- Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
- "Unexpected muldq/muludq types");
-
- unsigned IntrinsicID = II.getIntrinsicID();
- bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
- Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
- Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);
-
- SmallVector<unsigned, 16> ShuffleMask;
- for (unsigned i = 0; i != NumElts; ++i)
- ShuffleMask.push_back(i * 2);
-
- auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
- auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);
-
- if (IsSigned) {
- LHS = Builder.CreateSExt(LHS, ResTy);
- RHS = Builder.CreateSExt(RHS, ResTy);
- } else {
- LHS = Builder.CreateZExt(LHS, ResTy);
- RHS = Builder.CreateZExt(RHS, ResTy);
- }
-
- return Builder.CreateMul(LHS, RHS);
-}
-
static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -687,6 +576,105 @@ static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
return ConstantVector::get(Vals);
}
+// Replace X86-specific intrinsics with generic floor-ceil where applicable.
+static Value *simplifyX86round(IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ ConstantInt *Arg = nullptr;
+ Intrinsic::ID IntrinsicID = II.getIntrinsicID();
+
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(2));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(1));
+ if (!Arg)
+ return nullptr;
+ unsigned RoundControl = Arg->getZExtValue();
+
+ Arg = nullptr;
+ unsigned SAE = 0;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(4));
+ else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+ Arg = dyn_cast<ConstantInt>(II.getArgOperand(5));
+ else
+ SAE = 4;
+ if (!SAE) {
+ if (!Arg)
+ return nullptr;
+ SAE = Arg->getZExtValue();
+ }
+
+ if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/))
+ return nullptr;
+
+ Value *Src, *Dst, *Mask;
+ bool IsScalar = false;
+ if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+ IntrinsicID == Intrinsic::x86_sse41_round_sd ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ IsScalar = true;
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Mask = II.getArgOperand(3);
+ Value *Zero = Constant::getNullValue(Mask->getType());
+ Mask = Builder.CreateAnd(Mask, 1);
+ Mask = Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+ Dst = II.getArgOperand(2);
+ } else
+ Dst = II.getArgOperand(0);
+ Src = Builder.CreateExtractElement(II.getArgOperand(1), (uint64_t)0);
+ } else {
+ Src = II.getArgOperand(0);
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_128 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_256 ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512) {
+ Dst = II.getArgOperand(2);
+ Mask = II.getArgOperand(3);
+ } else {
+ Dst = Src;
+ Mask = ConstantInt::getAllOnesValue(
+ Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+ }
+ }
+
+ Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+ Value *Res = Builder.CreateIntrinsic(ID, {Src}, &II);
+ if (!IsScalar) {
+ if (auto *C = dyn_cast<Constant>(Mask))
+ if (C->isAllOnesValue())
+ return Res;
+ auto *MaskTy = VectorType::get(
+ Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth());
+ Mask = Builder.CreateBitCast(Mask, MaskTy);
+ unsigned Width = Src->getType()->getVectorNumElements();
+ if (MaskTy->getVectorNumElements() > Width) {
+ uint32_t Indices[4];
+ for (unsigned i = 0; i != Width; ++i)
+ Indices[i] = i;
+ Mask = Builder.CreateShuffleVector(Mask, Mask,
+ makeArrayRef(Indices, Width));
+ }
+ return Builder.CreateSelect(Mask, Res, Dst);
+ }
+ if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+ IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+ Dst = Builder.CreateExtractElement(Dst, (uint64_t)0);
+ Res = Builder.CreateSelect(Mask, Res, Dst);
+ Dst = II.getArgOperand(0);
+ }
+ return Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
static Value *simplifyX86movmsk(const IntrinsicInst &II) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
@@ -1145,36 +1133,6 @@ static Value *simplifyX86vpcom(const IntrinsicInst &II,
return nullptr;
}
-// Emit a select instruction and appropriate bitcasts to help simplify
-// masked intrinsics.
-static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
- InstCombiner::BuilderTy &Builder) {
- unsigned VWidth = Op0->getType()->getVectorNumElements();
-
- // If the mask is all ones we don't need the select. But we need to check
- // only the bit thats will be used in case VWidth is less than 8.
- if (auto *C = dyn_cast<ConstantInt>(Mask))
- if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
- return Op0;
-
- auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
- Mask = Builder.CreateBitCast(Mask, MaskTy);
-
- // If we have less than 8 elements, then the starting mask was an i8 and
- // we need to extract down to the right number of elements.
- if (VWidth < 8) {
- uint32_t Indices[4];
- for (unsigned i = 0; i != VWidth; ++i)
- Indices[i] = i;
- Mask = Builder.CreateShuffleVector(Mask, Mask,
- makeArrayRef(Indices, VWidth),
- "extract");
- }
-
- return Builder.CreateSelect(Mask, Op0, Op1);
-}
-
static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
Value *Arg0 = II.getArgOperand(0);
Value *Arg1 = II.getArgOperand(1);
@@ -1308,6 +1266,40 @@ static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
return nullptr;
}
+/// This function transforms launder.invariant.group and strip.invariant.group
+/// like:
+/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
+/// launder(strip(%x)) -> launder(%x)
+/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
+/// strip(launder(%x)) -> strip(%x)
+/// This is legal because it preserves the most recent information about
+/// the presence or absence of invariant.group.
+static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
+ InstCombiner &IC) {
+ auto *Arg = II.getArgOperand(0);
+ auto *StrippedArg = Arg->stripPointerCasts();
+ auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
+ if (StrippedArg == StrippedInvariantGroupsArg)
+ return nullptr; // No launders/strips to remove.
+
+ Value *Result = nullptr;
+
+ if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
+ Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
+ else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
+ Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
+ else
+ llvm_unreachable(
+ "simplifyInvariantGroupIntrinsic only handles launder and strip");
+ if (Result->getType()->getPointerAddressSpace() !=
+ II.getType()->getPointerAddressSpace())
+ Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
+ if (Result->getType() != II.getType())
+ Result = IC.Builder.CreateBitCast(Result, II.getType());
+
+ return cast<Instruction>(Result);
+}
+
static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
// If the mask is all zeros, a scatter does nothing.
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
@@ -1498,6 +1490,68 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
return maxnum(Src0, Src1);
}
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto *VecTy = cast<VectorType>(II.getType());
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ uint32_t Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if (Indexes[I] >= NumElts)
+ return nullptr;
+ }
+
+ auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
+ makeArrayRef(Indexes));
+ auto *V1 = II.getArgOperand(0);
+ auto *V2 = Constant::getNullValue(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+}
+
+/// Convert a vector load intrinsic into a simple llvm load instruction.
+/// This is beneficial when the underlying object being addressed comes
+/// from a constant, since we get constant-folding for free.
+static Value *simplifyNeonVld1(const IntrinsicInst &II,
+ unsigned MemAlign,
+ InstCombiner::BuilderTy &Builder) {
+ auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
+
+ if (!IntrAlign)
+ return nullptr;
+
+ unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
+ MemAlign : IntrAlign->getLimitedValue();
+
+ if (!isPowerOf2_32(Alignment))
+ return nullptr;
+
+ auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
+ PointerType::get(II.getType(), 0));
+ return Builder.CreateAlignedLoad(BCastInst, Alignment);
+}
+
// Returns true iff the 2 intrinsics have the same operands, limiting the
// comparison to the first NumOperands.
static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1820,7 +1874,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Intrinsics cannot occur in an invoke, so handle them here instead of in
// visitCallSite.
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
bool Changed = false;
// memmove/cpy/set of zero bytes is a noop.
@@ -1837,17 +1891,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
// No other transformations apply to volatile transfers.
- if (MI->isVolatile())
- return nullptr;
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.
- if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {
Module *M = CI.getModule();
- Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
Type *Tys[3] = { CI.getArgOperand(0)->getType(),
CI.getArgOperand(1)->getType(),
CI.getArgOperand(2)->getType() };
@@ -1856,7 +1914,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())
return eraseInstFromFunction(CI);
@@ -1864,26 +1922,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// If we can determine a pointer alignment that is bigger than currently
// set, update the alignment.
- if (isa<MemTransferInst>(MI)) {
- if (Instruction *I = SimplifyMemTransfer(MI))
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
- if (Instruction *I = SimplifyMemSet(MSI))
+ } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemSet(MSI))
return I;
}
if (Changed) return II;
}
- if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
- if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
- if (C->isNullValue())
- return eraseInstFromFunction(*AMI);
-
- if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
- return I;
- }
-
if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
return I;
@@ -1925,7 +1974,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return simplifyMaskedGather(*II, *this);
case Intrinsic::masked_scatter:
return simplifyMaskedScatter(*II, *this);
-
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
+ return replaceInstUsesWith(*II, SkippedBarrier);
+ break;
case Intrinsic::powi:
if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
// 0 and 1 are handled in instsimplify
@@ -1991,8 +2044,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->setArgOperand(1, Arg0);
return II;
}
+
+ // FIXME: Simplifications should be in instsimplify.
if (Value *V = simplifyMinnumMaxnum(*II))
return replaceInstUsesWith(*II, V);
+
+ Value *X, *Y;
+ if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
+ (Arg0->hasOneUse() || Arg1->hasOneUse())) {
+ // If both operands are negated, invert the call and negate the result:
+ // minnum(-X, -Y) --> -(maxnum(X, Y))
+ // maxnum(-X, -Y) --> -(minnum(X, Y))
+ Intrinsic::ID NewIID = II->getIntrinsicID() == Intrinsic::maxnum ?
+ Intrinsic::minnum : Intrinsic::maxnum;
+ Value *NewCall = Builder.CreateIntrinsic(NewIID, { X, Y }, II);
+ Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
+ FNeg->copyIRFlags(II);
+ return FNeg;
+ }
break;
}
case Intrinsic::fmuladd: {
@@ -2013,37 +2082,34 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src0 = II->getArgOperand(0);
Value *Src1 = II->getArgOperand(1);
- // Canonicalize constants into the RHS.
+ // Canonicalize constant multiply operand to Src1.
if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
II->setArgOperand(0, Src1);
II->setArgOperand(1, Src0);
std::swap(Src0, Src1);
}
- Value *LHS = nullptr;
- Value *RHS = nullptr;
-
// fma fneg(x), fneg(y), z -> fma x, y, z
- if (match(Src0, m_FNeg(m_Value(LHS))) &&
- match(Src1, m_FNeg(m_Value(RHS)))) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ Value *X, *Y;
+ if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, Y);
return II;
}
// fma fabs(x), fabs(x), z -> fma x, x, z
- if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
- match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
- II->setArgOperand(0, LHS);
- II->setArgOperand(1, RHS);
+ if (match(Src0, m_FAbs(m_Value(X))) &&
+ match(Src1, m_FAbs(m_Specific(X)))) {
+ II->setArgOperand(0, X);
+ II->setArgOperand(1, X);
return II;
}
// fma x, 1, z -> fadd x, z
if (match(Src1, m_FPOne())) {
- Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
- RI->copyFastMathFlags(II);
- return RI;
+ auto *FAdd = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
}
break;
@@ -2067,17 +2133,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::rint:
case Intrinsic::trunc: {
Value *ExtSrc;
- if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
- II->getArgOperand(0)->hasOneUse()) {
- // fabs (fpext x) -> fpext (fabs x)
- Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
- { ExtSrc->getType() });
- CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
- NewFabs->copyFastMathFlags(II);
- NewFabs->takeName(II);
- return new FPExtInst(NewFabs, II->getType());
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
+ // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
+ Value *NarrowII = Builder.CreateIntrinsic(II->getIntrinsicID(),
+ { ExtSrc }, II);
+ return new FPExtInst(NarrowII, II->getType());
}
-
break;
}
case Intrinsic::cos:
@@ -2085,7 +2146,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *SrcSrc;
Value *Src = II->getArgOperand(0);
if (match(Src, m_FNeg(m_Value(SrcSrc))) ||
- match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
+ match(Src, m_FAbs(m_Value(SrcSrc)))) {
// cos(-x) -> cos(x)
// cos(fabs(x)) -> cos(x)
II->setArgOperand(0, SrcSrc);
@@ -2298,6 +2359,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::x86_sse41_round_ps:
+ case Intrinsic::x86_sse41_round_pd:
+ case Intrinsic::x86_avx_round_ps_256:
+ case Intrinsic::x86_avx_round_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_128:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_256:
+ case Intrinsic::x86_avx512_mask_rndscale_ps_512:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_128:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_256:
+ case Intrinsic::x86_avx512_mask_rndscale_pd_512:
+ case Intrinsic::x86_avx512_mask_rndscale_ss:
+ case Intrinsic::x86_avx512_mask_rndscale_sd:
+ if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::x86_mmx_pmovmskb:
case Intrinsic::x86_sse_movmsk_ps:
case Intrinsic::x86_sse2_movmsk_pd:
@@ -2355,16 +2432,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
break;
}
- case Intrinsic::x86_avx512_mask_cmp_pd_128:
- case Intrinsic::x86_avx512_mask_cmp_pd_256:
- case Intrinsic::x86_avx512_mask_cmp_pd_512:
- case Intrinsic::x86_avx512_mask_cmp_ps_128:
- case Intrinsic::x86_avx512_mask_cmp_ps_256:
- case Intrinsic::x86_avx512_mask_cmp_ps_512: {
+ case Intrinsic::x86_avx512_cmp_pd_128:
+ case Intrinsic::x86_avx512_cmp_pd_256:
+ case Intrinsic::x86_avx512_cmp_pd_512:
+ case Intrinsic::x86_avx512_cmp_ps_128:
+ case Intrinsic::x86_avx512_cmp_ps_256:
+ case Intrinsic::x86_avx512_cmp_ps_512: {
// Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- bool Arg0IsZero = match(Arg0, m_Zero());
+ bool Arg0IsZero = match(Arg0, m_PosZeroFP());
if (Arg0IsZero)
std::swap(Arg0, Arg1);
Value *A, *B;
@@ -2376,7 +2453,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// The compare intrinsic uses the above assumptions and therefore
// doesn't require additional flags.
if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
- match(Arg1, m_Zero()) && isa<Instruction>(Arg0) &&
+ match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
if (Arg0IsZero)
std::swap(A, B);
@@ -2387,17 +2464,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
+ case Intrinsic::x86_avx512_div_pd_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
// IR operations.
- if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
+ if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
if (R->getValue() == 4) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
@@ -2405,27 +2482,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *V;
switch (II->getIntrinsicID()) {
default: llvm_unreachable("Case stmts out of sync!");
- case Intrinsic::x86_avx512_mask_add_ps_512:
- case Intrinsic::x86_avx512_mask_add_pd_512:
+ case Intrinsic::x86_avx512_add_ps_512:
+ case Intrinsic::x86_avx512_add_pd_512:
V = Builder.CreateFAdd(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_sub_ps_512:
- case Intrinsic::x86_avx512_mask_sub_pd_512:
+ case Intrinsic::x86_avx512_sub_ps_512:
+ case Intrinsic::x86_avx512_sub_pd_512:
V = Builder.CreateFSub(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_mul_ps_512:
- case Intrinsic::x86_avx512_mask_mul_pd_512:
+ case Intrinsic::x86_avx512_mul_ps_512:
+ case Intrinsic::x86_avx512_mul_pd_512:
V = Builder.CreateFMul(Arg0, Arg1);
break;
- case Intrinsic::x86_avx512_mask_div_ps_512:
- case Intrinsic::x86_avx512_mask_div_pd_512:
+ case Intrinsic::x86_avx512_div_ps_512:
+ case Intrinsic::x86_avx512_div_pd_512:
V = Builder.CreateFDiv(Arg0, Arg1);
break;
}
- // Create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
return replaceInstUsesWith(*II, V);
}
}
@@ -2499,32 +2573,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx512_mask_min_ss_round:
case Intrinsic::x86_avx512_mask_max_sd_round:
case Intrinsic::x86_avx512_mask_min_sd_round:
- case Intrinsic::x86_avx512_mask_vfmadd_ss:
- case Intrinsic::x86_avx512_mask_vfmadd_sd:
- case Intrinsic::x86_avx512_maskz_vfmadd_ss:
- case Intrinsic::x86_avx512_maskz_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmadd_ss:
- case Intrinsic::x86_avx512_mask3_vfmadd_sd:
- case Intrinsic::x86_avx512_mask3_vfmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfmsub_sd:
- case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
- case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
- case Intrinsic::x86_fma_vfmadd_ss:
- case Intrinsic::x86_fma_vfmsub_ss:
- case Intrinsic::x86_fma_vfnmadd_ss:
- case Intrinsic::x86_fma_vfnmsub_ss:
- case Intrinsic::x86_fma_vfmadd_sd:
- case Intrinsic::x86_fma_vfmsub_sd:
- case Intrinsic::x86_fma_vfnmadd_sd:
- case Intrinsic::x86_fma_vfnmsub_sd:
case Intrinsic::x86_sse_cmp_ss:
case Intrinsic::x86_sse_min_ss:
case Intrinsic::x86_sse_max_ss:
case Intrinsic::x86_sse2_cmp_sd:
case Intrinsic::x86_sse2_min_sd:
case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse41_round_ss:
- case Intrinsic::x86_sse41_round_sd:
case Intrinsic::x86_xop_vfrcz_ss:
case Intrinsic::x86_xop_vfrcz_sd: {
unsigned VWidth = II->getType()->getVectorNumElements();
@@ -2537,6 +2591,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
}
+ case Intrinsic::x86_sse41_round_ss:
+ case Intrinsic::x86_sse41_round_sd: {
+ unsigned VWidth = II->getType()->getVectorNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ } else if (Value *V = simplifyX86round(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
// Constant fold ashr( <A x Bi>, Ci ).
// Constant fold lshr( <A x Bi>, Ci ).
@@ -2647,26 +2714,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_sse2_pmulu_dq:
- case Intrinsic::x86_sse41_pmuldq:
- case Intrinsic::x86_avx2_pmul_dq:
- case Intrinsic::x86_avx2_pmulu_dq:
- case Intrinsic::x86_avx512_pmul_dq_512:
- case Intrinsic::x86_avx512_pmulu_dq_512: {
- if (Value *V = simplifyX86muldq(*II, Builder))
- return replaceInstUsesWith(*II, V);
-
- unsigned VWidth = II->getType()->getVectorNumElements();
- APInt UndefElts(VWidth, 0);
- APInt DemandedElts = APInt::getAllOnesValue(VWidth);
- if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
- if (V != II)
- return replaceInstUsesWith(*II, V);
- return II;
- }
- break;
- }
-
case Intrinsic::x86_sse2_packssdw_128:
case Intrinsic::x86_sse2_packsswb_128:
case Intrinsic::x86_avx2_packssdw:
@@ -2687,7 +2734,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_pclmulqdq: {
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512: {
if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
unsigned Imm = C->getZExtValue();
@@ -2695,27 +2744,28 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
unsigned VWidth = Arg0->getType()->getVectorNumElements();
- APInt DemandedElts(VWidth, 0);
APInt UndefElts1(VWidth, 0);
- DemandedElts = (Imm & 0x01) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
+ APInt DemandedElts1 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x01) ? 2 : 1));
+ if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
UndefElts1)) {
II->setArgOperand(0, V);
MadeChange = true;
}
APInt UndefElts2(VWidth, 0);
- DemandedElts = (Imm & 0x10) ? 2 : 1;
- if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
+ APInt DemandedElts2 = APInt::getSplat(VWidth,
+ APInt(2, (Imm & 0x10) ? 2 : 1));
+ if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
UndefElts2)) {
II->setArgOperand(1, V);
MadeChange = true;
}
- // If both input elements are undef, the result is undef.
- if (UndefElts1[(Imm & 0x01) ? 1 : 0] ||
- UndefElts2[(Imm & 0x10) ? 1 : 0])
+ // If either input's demanded elements are all undef, the result is zero.
+ if (DemandedElts1.isSubsetOf(UndefElts1) ||
+ DemandedElts2.isSubsetOf(UndefElts2))
return replaceInstUsesWith(*II,
ConstantAggregateZero::get(II->getType()));
@@ -2916,32 +2966,22 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_permd:
case Intrinsic::x86_avx2_permps:
+ case Intrinsic::x86_avx512_permvar_df_256:
+ case Intrinsic::x86_avx512_permvar_df_512:
+ case Intrinsic::x86_avx512_permvar_di_256:
+ case Intrinsic::x86_avx512_permvar_di_512:
+ case Intrinsic::x86_avx512_permvar_hi_128:
+ case Intrinsic::x86_avx512_permvar_hi_256:
+ case Intrinsic::x86_avx512_permvar_hi_512:
+ case Intrinsic::x86_avx512_permvar_qi_128:
+ case Intrinsic::x86_avx512_permvar_qi_256:
+ case Intrinsic::x86_avx512_permvar_qi_512:
+ case Intrinsic::x86_avx512_permvar_sf_512:
+ case Intrinsic::x86_avx512_permvar_si_512:
if (Value *V = simplifyX86vpermv(*II, Builder))
return replaceInstUsesWith(*II, V);
break;
- case Intrinsic::x86_avx512_mask_permvar_df_256:
- case Intrinsic::x86_avx512_mask_permvar_df_512:
- case Intrinsic::x86_avx512_mask_permvar_di_256:
- case Intrinsic::x86_avx512_mask_permvar_di_512:
- case Intrinsic::x86_avx512_mask_permvar_hi_128:
- case Intrinsic::x86_avx512_mask_permvar_hi_256:
- case Intrinsic::x86_avx512_mask_permvar_hi_512:
- case Intrinsic::x86_avx512_mask_permvar_qi_128:
- case Intrinsic::x86_avx512_mask_permvar_qi_256:
- case Intrinsic::x86_avx512_mask_permvar_qi_512:
- case Intrinsic::x86_avx512_mask_permvar_sf_256:
- case Intrinsic::x86_avx512_mask_permvar_sf_512:
- case Intrinsic::x86_avx512_mask_permvar_si_256:
- case Intrinsic::x86_avx512_mask_permvar_si_512:
- if (Value *V = simplifyX86vpermv(*II, Builder)) {
- // We simplified the permuting, now create a select for the masking.
- V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
- Builder);
- return replaceInstUsesWith(*II, V);
- }
- break;
-
case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
@@ -3042,7 +3082,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
- case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld1: {
+ unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
+ DL, II, &AC, &DT);
+ if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+ }
+
case Intrinsic::arm_neon_vld2:
case Intrinsic::arm_neon_vld3:
case Intrinsic::arm_neon_vld4:
@@ -3069,6 +3116,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_vtbl1:
+ case Intrinsic::aarch64_neon_tbl1:
+ if (Value *V = simplifyNeonTbl1(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
case Intrinsic::arm_neon_vmulls:
case Intrinsic::arm_neon_vmullu:
case Intrinsic::aarch64_neon_smull:
@@ -3107,6 +3160,23 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::arm_neon_aesd:
+ case Intrinsic::arm_neon_aese:
+ case Intrinsic::aarch64_crypto_aesd:
+ case Intrinsic::aarch64_crypto_aese: {
+ Value *DataArg = II->getArgOperand(0);
+ Value *KeyArg = II->getArgOperand(1);
+
+ // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+ Value *Data, *Key;
+ if (match(KeyArg, m_ZeroInt()) &&
+ match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+ II->setArgOperand(0, Data);
+ II->setArgOperand(1, Key);
+ return II;
+ }
+ break;
+ }
case Intrinsic::amdgcn_rcp: {
Value *Src = II->getArgOperand(0);
@@ -3264,6 +3334,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_cvt_pknorm_i16:
+ case Intrinsic::amdgcn_cvt_pknorm_u16:
+ case Intrinsic::amdgcn_cvt_pk_i16:
+ case Intrinsic::amdgcn_cvt_pk_u16: {
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+
+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+ break;
+ }
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe: {
// Decompose simple cases into standard shifts.
@@ -3370,6 +3452,24 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src1 = II->getArgOperand(1);
Value *Src2 = II->getArgOperand(2);
+ // Checking for NaN before canonicalization provides better fidelity when
+ // mapping other operations onto fmed3 since the order of operands is
+ // unchanged.
+ CallInst *NewCall = nullptr;
+ if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
+ NewCall = Builder.CreateMinNum(Src1, Src2);
+ } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
+ NewCall = Builder.CreateMinNum(Src0, Src2);
+ } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+ NewCall = Builder.CreateMaxNum(Src0, Src1);
+ }
+
+ if (NewCall) {
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+
bool Swap = false;
// Canonicalize constants to RHS operands.
//
@@ -3396,13 +3496,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return II;
}
- if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
- CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
- NewCall->copyFastMathFlags(II);
- NewCall->takeName(II);
- return replaceInstUsesWith(*II, NewCall);
- }
-
if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
@@ -3536,13 +3629,32 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// amdgcn.kill(i1 1) is a no-op
return eraseInstFromFunction(CI);
}
+ case Intrinsic::amdgcn_update_dpp: {
+ Value *Old = II->getArgOperand(0);
+
+ auto BC = dyn_cast<ConstantInt>(II->getArgOperand(5));
+ auto RM = dyn_cast<ConstantInt>(II->getArgOperand(3));
+ auto BM = dyn_cast<ConstantInt>(II->getArgOperand(4));
+ if (!BC || !RM || !BM ||
+ BC->isZeroValue() ||
+ RM->getZExtValue() != 0xF ||
+ BM->getZExtValue() != 0xF ||
+ isa<UndefValue>(Old))
+ break;
+
+ // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
+ II->setOperand(0, UndefValue::get(Old->getType()));
+ return II;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- if (&*++SS->getIterator() == II)
+ // Skip over debug info.
+ if (SS->getNextNonDebugInstruction() == II) {
return eraseInstFromFunction(CI);
+ }
}
}
@@ -3597,9 +3709,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::assume: {
Value *IIOperand = II->getArgOperand(0);
- // Remove an assume if it is immediately followed by an identical assume.
- if (match(II->getNextNode(),
- m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ // Remove an assume if it is followed by an identical assume.
+ // TODO: Do we need this? Unless there are conflicting assumptions, the
+ // computeKnownBits(IIOperand) below here eliminates redundant assumes.
+ Instruction *Next = II->getNextNonDebugInstruction();
+ if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
return eraseInstFromFunction(CI);
// Canonicalize assume(a && b) -> assume(a); assume(b);
@@ -3686,8 +3800,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
case Intrinsic::experimental_guard: {
- // Is this guard followed by another guard?
+ // Is this guard followed by another guard? We scan forward over a small
+ // fixed window of instructions to handle common cases with conditions
+ // computed between guards.
Instruction *NextInst = II->getNextNode();
+ for (unsigned i = 0; i < GuardWideningWindow; i++) {
+ // Note: Using context-free form to avoid compile time blow up
+ if (!isSafeToSpeculativelyExecute(NextInst))
+ break;
+ NextInst = NextInst->getNextNode();
+ }
Value *NextCond = nullptr;
if (match(NextInst,
m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
@@ -3698,6 +3820,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(*NextInst);
// Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ Instruction* MoveI = II->getNextNode();
+ while (MoveI != NextInst) {
+ auto *Temp = MoveI;
+ MoveI = MoveI->getNextNode();
+ Temp->moveBefore(II);
+ }
II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
return eraseInstFromFunction(*NextInst);
}
@@ -3710,7 +3838,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Fence instruction simplification
Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
// Remove identical consecutive fences.
- if (auto *NFI = dyn_cast<FenceInst>(FI.getNextNode()))
+ Instruction *Next = FI.getNextNonDebugInstruction();
+ if (auto *NFI = dyn_cast<FenceInst>(Next))
if (FI.isIdenticalTo(NFI))
return eraseInstFromFunction(FI);
return nullptr;
@@ -3887,8 +4016,8 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
// Remove the convergent attr on calls when the callee is not convergent.
if (CS.isConvergent() && !CalleeF->isConvergent() &&
!CalleeF->isIntrinsic()) {
- DEBUG(dbgs() << "Removing convergent attr from instr "
- << CS.getInstruction() << "\n");
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from instr "
+ << CS.getInstruction() << "\n");
CS.setNotConvergent();
return CS.getInstruction();
}
@@ -3919,7 +4048,9 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
}
}
- if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CS.getInstruction()->getFunction())) ||
+ isa<UndefValue>(Callee)) {
// If CS does not return void then replaceAllUsesWith undef.
// This allows ValueHandlers and custom metadata to adjust itself.
if (!CS.getInstruction()->getType()->isVoidTy())
@@ -3986,10 +4117,19 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!Callee)
return false;
- // The prototype of a thunk is a lie. Don't directly call such a function.
+ // If this is a call to a thunk function, don't remove the cast. Thunks are
+ // used to transparently forward all incoming parameters and outgoing return
+ // values, so it's important to leave the cast in place.
if (Callee->hasFnAttribute("thunk"))
return false;
+ // If this is a musttail call, the callee's prototype must match the caller's
+ // prototype with the exception of pointee types. The code below doesn't
+ // implement that, so we can't do this transform.
+ // TODO: Do the transform if it only requires adding pointer casts.
+ if (CS.isMustTailCall())
+ return false;
+
Instruction *Caller = CS.getInstruction();
const AttributeList &CallerPAL = CS.getAttributes();